Merge branch 'RED-9353' into 'main'
RED-9353: refactor PDFTronViewerDocumentService See merge request fforesight/layout-parser!178
This commit is contained in:
commit
033279e261
@ -116,29 +116,14 @@ public class LayoutParsingPipeline {
|
||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||
|
||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||
// File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
|
||||
File viewerDocumentFile = originFile;
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
|
||||
|
||||
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
|
||||
if (layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.isPresent()) {
|
||||
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.get());
|
||||
}
|
||||
|
||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||
if (layoutParsingRequest.imagesFileStorageId()
|
||||
.isPresent()) {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
|
||||
.get());
|
||||
}
|
||||
|
||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||
if (layoutParsingRequest.tablesFileStorageId()
|
||||
.isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
|
||||
.get());
|
||||
}
|
||||
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse());
|
||||
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
|
||||
.map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse());
|
||||
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
|
||||
.map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse());
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
|
||||
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
|
||||
@ -151,16 +136,12 @@ public class LayoutParsingPipeline {
|
||||
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
||||
|
||||
Document documentGraph = observeBuildDocumentGraph(settings.getLayoutParsingTypeOverride() == null //
|
||||
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), classificationDocument);
|
||||
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
|
||||
classificationDocument);
|
||||
|
||||
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
||||
|
||||
layoutGridService.addLayoutGrid(viewerDocumentFile,
|
||||
documentGraph,
|
||||
viewerDocumentFile,
|
||||
false,
|
||||
layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.isPresent());
|
||||
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent());
|
||||
|
||||
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
||||
|
||||
@ -249,10 +230,11 @@ public class LayoutParsingPipeline {
|
||||
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
||||
|
||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||
|
||||
if (settings.isDebug() || identifier.containsKey("debug")) {
|
||||
classificationDocument.getVisualizations().setActive(true);
|
||||
classificationDocument.getLayoutDebugLayer().setActive(true);
|
||||
}
|
||||
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
@ -290,7 +272,7 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
stripper.getText(originDocument);
|
||||
List<TextPositionSequence> words = stripper.getTextPositionSequences();
|
||||
classificationDocument.getVisualizations().addTextVisualizations(words, pageNumber);
|
||||
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);
|
||||
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
|
||||
@ -298,32 +280,34 @@ public class LayoutParsingPipeline {
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), pageNumber);
|
||||
classificationDocument.getLayoutDebugLayer().addRulingVisualization(stripper.getRulings(), pageNumber);
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||
|
||||
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
|
||||
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
|
||||
classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber);
|
||||
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
|
||||
|
||||
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(),
|
||||
|
||||
false);
|
||||
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false);
|
||||
|
||||
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
||||
.addAll(graphics.stream()
|
||||
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, stripper.getPageNumber(), ""))
|
||||
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()),
|
||||
ImageType.GRAPHIC,
|
||||
false,
|
||||
stripper.getPageNumber(),
|
||||
""))
|
||||
.toList());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER_OLD ->
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getLayoutDebugLayer());
|
||||
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
||||
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
|
||||
};
|
||||
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
@ -347,7 +331,7 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
}
|
||||
|
||||
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
|
||||
classificationDocument.getLayoutDebugLayer().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
|
||||
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
||||
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
|
||||
|
||||
@ -378,7 +362,7 @@ public class LayoutParsingPipeline {
|
||||
log.info("Calculating BodyTextFrame for {}", identifier);
|
||||
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
||||
for (ClassificationPage page : classificationDocument.getPages()) {
|
||||
classificationDocument.getVisualizations().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber());
|
||||
classificationDocument.getLayoutDebugLayer().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber());
|
||||
}
|
||||
log.info("Classify TextBlocks for {}", identifier);
|
||||
switch (layoutParsingType) {
|
||||
|
||||
@ -5,10 +5,7 @@ import org.springframework.context.annotation.Bean;
|
||||
import org.springframework.context.annotation.ComponentScan;
|
||||
import org.springframework.context.annotation.Configuration;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.IViewerDocumentService;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.pdftron.PDFTronViewerDocumentService;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
|
||||
@ -18,14 +15,9 @@ public class LayoutParsingServiceProcessorConfiguration {
|
||||
|
||||
@Bean
|
||||
@Autowired
|
||||
public IViewerDocumentService viewerDocumentService(ObservationRegistry registry, LayoutparserSettings settings) {
|
||||
|
||||
if (!Strings.isNullOrEmpty(settings.getPdftronLicense())) {
|
||||
return new PDFTronViewerDocumentService(registry);
|
||||
} else {
|
||||
return new ViewerDocumentService(registry);
|
||||
}
|
||||
public PDFTronViewerDocumentService viewerDocumentService(ObservationRegistry registry) {
|
||||
|
||||
return new PDFTronViewerDocumentService(registry);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -21,6 +21,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocVersioningUtility;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
@ -36,6 +37,7 @@ public class LayoutParsingStorageService {
|
||||
private final StorageService storageService;
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
|
||||
@Observed(name = "LayoutParsingStorageService", contextualName = "get-origin-file")
|
||||
public File getOriginFile(String storageId) throws IOException {
|
||||
|
||||
@ -53,11 +55,18 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
File tempFile = createTempFile("viewerDocument", ".pdf");
|
||||
storageService.downloadTo(TenantContext.getTenantId(), storageId, tempFile);
|
||||
|
||||
if (!ViewerDocVersioningUtility.isCurrentVersion(tempFile)) {
|
||||
assert tempFile.delete();
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
return Optional.of(tempFile);
|
||||
}
|
||||
|
||||
|
||||
public ImageServiceResponse getImagesFile(String storageId) throws IOException {
|
||||
@SneakyThrows
|
||||
public ImageServiceResponse getImagesFile(String storageId) {
|
||||
|
||||
try (InputStream inputStream = getObject(storageId)) {
|
||||
|
||||
@ -68,7 +77,8 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
|
||||
|
||||
public TableServiceResponse getTablesFile(String storageId) throws IOException {
|
||||
@SneakyThrows
|
||||
public TableServiceResponse getTablesFile(String storageId) {
|
||||
|
||||
try (var tableClassificationStream = getObject(storageId)) {
|
||||
|
||||
@ -78,11 +88,12 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
}
|
||||
|
||||
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) throws IOException {
|
||||
|
||||
@SneakyThrows
|
||||
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) {
|
||||
|
||||
try (InputStream inputStream = getObject(storageId)) {
|
||||
VisualLayoutParsingResponse visualLayoutParsingResponse = objectMapper.readValue(inputStream, VisualLayoutParsingResponse.class);
|
||||
return visualLayoutParsingResponse;
|
||||
return objectMapper.readValue(inputStream, VisualLayoutParsingResponse.class);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@ -33,7 +33,7 @@ public class DocstrumSegmentationService {
|
||||
private final ReadingOrderService readingOrderService;
|
||||
|
||||
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutparsingVisualizations visualizations) {
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) {
|
||||
|
||||
List<Zone> zones = new ArrayList<>();
|
||||
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO));
|
||||
@ -45,7 +45,7 @@ public class DocstrumSegmentationService {
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> computeZones(List<TextPositionSequence> textPositions, CleanRulings rulings, LayoutparsingVisualizations visualizations, TextDirection direction) {
|
||||
private List<Zone> computeZones(List<TextPositionSequence> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) {
|
||||
|
||||
List<RedTextPosition> positions = textPositions.stream()
|
||||
.filter(t -> t.getDir() == direction)
|
||||
|
||||
@ -7,7 +7,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.Outlin
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -25,7 +25,7 @@ public class ClassificationDocument {
|
||||
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
|
||||
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
|
||||
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
|
||||
private LayoutparsingVisualizations visualizations = new LayoutparsingVisualizations();
|
||||
private LayoutDebugLayer layoutDebugLayer = new LayoutDebugLayer();
|
||||
private boolean headlines;
|
||||
|
||||
private long rulesVersion;
|
||||
|
||||
@ -13,13 +13,13 @@ import lombok.Setter;
|
||||
@Setter
|
||||
@EqualsAndHashCode
|
||||
@SuppressWarnings("PMD.AvoidFieldNameMatchingMethodName")
|
||||
public class Boundary implements Comparable<Boundary> {
|
||||
public class TextRange implements Comparable<TextRange> {
|
||||
|
||||
private int start;
|
||||
private int end;
|
||||
|
||||
|
||||
public Boundary(int start, int end) {
|
||||
public TextRange(int start, int end) {
|
||||
|
||||
if (start > end) {
|
||||
throw new IllegalArgumentException(format("start: %d > end: %d", start, end));
|
||||
@ -47,15 +47,15 @@ public class Boundary implements Comparable<Boundary> {
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(Boundary boundary) {
|
||||
public boolean contains(TextRange textRange) {
|
||||
|
||||
return start <= boundary.start() && boundary.end() <= end;
|
||||
return start <= textRange.start() && textRange.end() <= end;
|
||||
}
|
||||
|
||||
|
||||
public boolean containedBy(Boundary boundary) {
|
||||
public boolean containedBy(TextRange textRange) {
|
||||
|
||||
return boundary.contains(this);
|
||||
return textRange.contains(this);
|
||||
}
|
||||
|
||||
|
||||
@ -83,18 +83,18 @@ public class Boundary implements Comparable<Boundary> {
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(Boundary boundary) {
|
||||
public boolean intersects(TextRange textRange) {
|
||||
|
||||
return boundary.start() < this.end && this.start < boundary.end();
|
||||
return textRange.start() < this.end && this.start < textRange.end();
|
||||
}
|
||||
|
||||
|
||||
public List<Boundary> split(List<Integer> splitIndices) {
|
||||
public List<TextRange> split(List<Integer> splitIndices) {
|
||||
|
||||
if (splitIndices.stream().anyMatch(idx -> !this.contains(idx))) {
|
||||
throw new IndexOutOfBoundsException(format("%s splitting indices are out of range for %s", splitIndices.stream().filter(idx -> !this.contains(idx)).toList(), this));
|
||||
}
|
||||
List<Boundary> splitBoundaries = new LinkedList<>();
|
||||
List<TextRange> splitBoundaries = new LinkedList<>();
|
||||
int previousIndex = start;
|
||||
for (int splitIndex : splitIndices) {
|
||||
|
||||
@ -102,10 +102,10 @@ public class Boundary implements Comparable<Boundary> {
|
||||
if (splitIndex == previousIndex) {
|
||||
continue;
|
||||
}
|
||||
splitBoundaries.add(new Boundary(previousIndex, splitIndex));
|
||||
splitBoundaries.add(new TextRange(previousIndex, splitIndex));
|
||||
previousIndex = splitIndex;
|
||||
}
|
||||
splitBoundaries.add(new Boundary(previousIndex, end));
|
||||
splitBoundaries.add(new TextRange(previousIndex, end));
|
||||
return splitBoundaries;
|
||||
}
|
||||
|
||||
@ -114,11 +114,11 @@ public class Boundary implements Comparable<Boundary> {
|
||||
return IntStream.range(start, end);
|
||||
}
|
||||
|
||||
public static Boundary merge(Collection<Boundary> boundaries) {
|
||||
public static TextRange merge(Collection<TextRange> boundaries) {
|
||||
|
||||
int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
|
||||
int maxEnd = boundaries.stream().mapToInt(Boundary::end).max().orElseThrow(IllegalArgumentException::new);
|
||||
return new Boundary(minStart, maxEnd);
|
||||
int minStart = boundaries.stream().mapToInt(TextRange::start).min().orElseThrow(IllegalArgumentException::new);
|
||||
int maxEnd = boundaries.stream().mapToInt(TextRange::end).max().orElseThrow(IllegalArgumentException::new);
|
||||
return new TextRange(minStart, maxEnd);
|
||||
}
|
||||
|
||||
|
||||
@ -130,12 +130,12 @@ public class Boundary implements Comparable<Boundary> {
|
||||
|
||||
|
||||
@Override
|
||||
public int compareTo(Boundary boundary) {
|
||||
public int compareTo(TextRange textRange) {
|
||||
|
||||
if (end < boundary.end() && start < boundary.start()) {
|
||||
if (end < textRange.end() && start < textRange.start()) {
|
||||
return -1;
|
||||
}
|
||||
if (start > boundary.start() && end > boundary.end()) {
|
||||
if (start > textRange.start() && end > textRange.end()) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -11,7 +11,7 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Engine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
||||
@ -28,11 +28,11 @@ import lombok.experimental.FieldDefaults;
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||
public class RedactionEntity {
|
||||
public class TextEntity {
|
||||
|
||||
// initial values
|
||||
@EqualsAndHashCode.Include
|
||||
final Boundary boundary;
|
||||
final TextRange textRange;
|
||||
@EqualsAndHashCode.Include
|
||||
final String type;
|
||||
@EqualsAndHashCode.Include
|
||||
@ -47,7 +47,7 @@ public class RedactionEntity {
|
||||
boolean dictionaryEntry;
|
||||
boolean dossierDictionaryEntry;
|
||||
Set<Engine> engines;
|
||||
Set<RedactionEntity> references;
|
||||
Set<TextEntity> references;
|
||||
@Builder.Default
|
||||
Deque<Integer> matchedRules = new LinkedList<>();
|
||||
String redactionReason;
|
||||
@ -66,9 +66,9 @@ public class RedactionEntity {
|
||||
SemanticNode deepestFullyContainingNode;
|
||||
|
||||
|
||||
public static RedactionEntity initialEntityNode(Boundary boundary, String type, EntityType entityType) {
|
||||
public static TextEntity initialEntityNode(TextRange textRange, String type, EntityType entityType) {
|
||||
|
||||
return RedactionEntity.builder().type(type).entityType(entityType).boundary(boundary).engines(new HashSet<>()).references(new HashSet<>()).build();
|
||||
return TextEntity.builder().type(type).entityType(entityType).textRange(textRange).engines(new HashSet<>()).references(new HashSet<>()).build();
|
||||
}
|
||||
|
||||
|
||||
@ -132,7 +132,7 @@ public class RedactionEntity {
|
||||
public List<RedactionPosition> getRedactionPositionsPerPage() {
|
||||
|
||||
if (redactionPositionsPerPage == null || redactionPositionsPerPage.isEmpty()) {
|
||||
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(boundary);
|
||||
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = deepestFullyContainingNode.getTextBlock().getPositionsPerPage(textRange);
|
||||
|
||||
Page firstPage = rectanglesPerLinePerPage.keySet()
|
||||
.stream()
|
||||
@ -155,21 +155,21 @@ public class RedactionEntity {
|
||||
}
|
||||
|
||||
|
||||
public boolean containedBy(RedactionEntity redactionEntity) {
|
||||
public boolean containedBy(TextEntity textEntity) {
|
||||
|
||||
return this.boundary.containedBy(redactionEntity.getBoundary());
|
||||
return this.textRange.containedBy(textEntity.getTextRange());
|
||||
}
|
||||
|
||||
|
||||
public boolean contains(RedactionEntity redactionEntity) {
|
||||
public boolean contains(TextEntity textEntity) {
|
||||
|
||||
return this.boundary.contains(redactionEntity.getBoundary());
|
||||
return this.textRange.contains(textEntity.getTextRange());
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(RedactionEntity redactionEntity) {
|
||||
public boolean intersects(TextEntity textEntity) {
|
||||
|
||||
return this.boundary.intersects(redactionEntity.getBoundary());
|
||||
return this.textRange.intersects(textEntity.getTextRange());
|
||||
}
|
||||
|
||||
|
||||
@ -185,13 +185,13 @@ public class RedactionEntity {
|
||||
}
|
||||
|
||||
|
||||
public void addReference(RedactionEntity reference) {
|
||||
public void addReference(TextEntity reference) {
|
||||
|
||||
references.add(reference);
|
||||
}
|
||||
|
||||
|
||||
public void addReferences(List<RedactionEntity> references) {
|
||||
public void addReferences(List<TextEntity> references) {
|
||||
|
||||
this.references.addAll(references);
|
||||
}
|
||||
@ -210,7 +210,7 @@ public class RedactionEntity {
|
||||
sb.append("Entity[\"");
|
||||
sb.append(value);
|
||||
sb.append("\", ");
|
||||
sb.append(boundary);
|
||||
sb.append(textRange);
|
||||
sb.append(", pages[");
|
||||
pages.forEach(page -> {
|
||||
sb.append(page.getNumber());
|
||||
@ -8,7 +8,7 @@ import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
@ -39,7 +39,7 @@ public abstract class AbstractSemanticNode implements GenericSemanticNode {
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
Set<TextEntity> entities = new HashSet<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
|
||||
@ -12,7 +12,7 @@ import java.util.stream.Stream;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -33,7 +33,7 @@ public class Document extends AbstractSemanticNode {
|
||||
Set<Page> pages;
|
||||
Integer numberOfPages;
|
||||
|
||||
LayoutparsingVisualizations visualizations;
|
||||
LayoutDebugLayer layoutDebugLayer;
|
||||
|
||||
|
||||
@Override
|
||||
|
||||
@ -92,4 +92,16 @@ public class Image extends AbstractSemanticNode {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public double getArea() {
|
||||
|
||||
return position.getWidth() * position.getHeight();
|
||||
}
|
||||
|
||||
|
||||
public boolean isFullPageImage() {
|
||||
|
||||
return imageType.equals(ImageType.OCR) || getArea() >= 0.5 * page.getArea();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -6,7 +6,6 @@ public enum ImageType {
|
||||
LOGO,
|
||||
FORMULA,
|
||||
SIGNATURE,
|
||||
|
||||
SIGNATURE_VISUAL,
|
||||
OTHER,
|
||||
OCR,
|
||||
|
||||
@ -6,7 +6,7 @@ import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
@ -39,7 +39,7 @@ public class Page {
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
Set<TextEntity> entities = new HashSet<>();
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
@ -60,7 +60,10 @@ public class Page {
|
||||
|
||||
public TextBlock getMainBodyTextBlock() {
|
||||
|
||||
return mainBody.stream().filter(SemanticNode::isLeaf).map(SemanticNode::getLeafTextBlock).collect(new TextBlockCollector());
|
||||
return mainBody.stream()
|
||||
.filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getLeafTextBlock)
|
||||
.collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
@ -84,4 +87,10 @@ public class Page {
|
||||
return o instanceof Page && o.hashCode() == this.hashCode();
|
||||
}
|
||||
|
||||
|
||||
public double getArea() {
|
||||
|
||||
return height * width;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -14,13 +14,14 @@ import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.EntityType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.BBoxMergingUtility;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
public interface SemanticNode {
|
||||
@ -42,7 +43,9 @@ public interface SemanticNode {
|
||||
*/
|
||||
default TextBlock getTextBlock() {
|
||||
|
||||
return streamAllSubNodes().filter(SemanticNode::isLeaf).map(SemanticNode::getTextBlock).collect(new TextBlockCollector());
|
||||
return streamAllSubNodes().filter(SemanticNode::isLeaf)
|
||||
.map(SemanticNode::getTextBlock)
|
||||
.collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
@ -52,7 +55,7 @@ public interface SemanticNode {
|
||||
*
|
||||
* @return Set of all Entities associated with this Node
|
||||
*/
|
||||
Set<RedactionEntity> getEntities();
|
||||
Set<TextEntity> getEntities();
|
||||
|
||||
|
||||
/**
|
||||
@ -68,7 +71,10 @@ public interface SemanticNode {
|
||||
|
||||
default Page getFirstPage() {
|
||||
|
||||
return getTextBlock().getPages().stream().min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
|
||||
return getTextBlock().getPages()
|
||||
.stream()
|
||||
.min(Comparator.comparingInt(Page::getNumber))
|
||||
.orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
|
||||
}
|
||||
|
||||
|
||||
@ -77,18 +83,19 @@ public interface SemanticNode {
|
||||
*
|
||||
* @return Set of PageNodes this node appears on.
|
||||
*/
|
||||
default Set<Page> getPages(Boundary boundary) {
|
||||
default Set<Page> getPages(TextRange textRange) {
|
||||
|
||||
if (!getBoundary().contains(boundary)) {
|
||||
throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", boundary, getBoundary()));
|
||||
if (!getBoundary().contains(textRange)) {
|
||||
throw new IllegalArgumentException(format("%s which was used to query for pages is not contained in the %s of this node!", textRange, getBoundary()));
|
||||
}
|
||||
return getTextBlock().getPages(boundary);
|
||||
return getTextBlock().getPages(textRange);
|
||||
}
|
||||
|
||||
|
||||
default boolean isOnPage(int pageNumber) {
|
||||
|
||||
return getPages().stream().anyMatch(page -> page.getNumber() == pageNumber);
|
||||
return getPages().stream()
|
||||
.anyMatch(page -> page.getNumber() == pageNumber);
|
||||
}
|
||||
|
||||
|
||||
@ -203,7 +210,9 @@ public interface SemanticNode {
|
||||
*/
|
||||
default boolean hasEntitiesOfType(String type) {
|
||||
|
||||
return getEntities().stream().filter(entity -> entity.getEntityType().equals(EntityType.ENTITY)).anyMatch(redactionEntity -> redactionEntity.getType().equals(type));
|
||||
return getEntities().stream()
|
||||
.filter(entity -> entity.getEntityType().equals(EntityType.ENTITY))
|
||||
.anyMatch(redactionEntity -> redactionEntity.getType().equals(type));
|
||||
}
|
||||
|
||||
|
||||
@ -213,9 +222,11 @@ public interface SemanticNode {
|
||||
* @param type string representing the type of entities to return
|
||||
* @return List of RedactionEntities of any the type
|
||||
*/
|
||||
default List<RedactionEntity> getEntitiesOfType(String type) {
|
||||
default List<TextEntity> getEntitiesOfType(String type) {
|
||||
|
||||
return getEntities().stream().filter(redactionEntity -> redactionEntity.getType().equals(type)).toList();
|
||||
return getEntities().stream()
|
||||
.filter(redactionEntity -> redactionEntity.getType().equals(type))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -225,9 +236,11 @@ public interface SemanticNode {
|
||||
* @param types A list of strings representing the types of entities to return
|
||||
* @return List of RedactionEntities of any provided type
|
||||
*/
|
||||
default List<RedactionEntity> getEntitiesOfType(List<String> types) {
|
||||
default List<TextEntity> getEntitiesOfType(List<String> types) {
|
||||
|
||||
return getEntities().stream().filter(redactionEntity -> redactionEntity.isAnyType(types)).toList();
|
||||
return getEntities().stream()
|
||||
.filter(redactionEntity -> redactionEntity.isAnyType(types))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -241,7 +254,8 @@ public interface SemanticNode {
|
||||
|
||||
TextBlock textBlock = getTextBlock();
|
||||
if (!textBlock.getAtomicTextBlocks().isEmpty()) {
|
||||
return getTextBlock().getAtomicTextBlocks().get(0).getNumberOnPage();
|
||||
return getTextBlock().getAtomicTextBlocks()
|
||||
.get(0).getNumberOnPage();
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
@ -279,7 +293,8 @@ public interface SemanticNode {
|
||||
*/
|
||||
default boolean containsStrings(List<String> strings) {
|
||||
|
||||
return strings.stream().allMatch(this::containsString);
|
||||
return strings.stream()
|
||||
.allMatch(this::containsString);
|
||||
}
|
||||
|
||||
|
||||
@ -303,7 +318,8 @@ public interface SemanticNode {
|
||||
*/
|
||||
default boolean containsAnyString(List<String> strings) {
|
||||
|
||||
return strings.stream().anyMatch(this::containsString);
|
||||
return strings.stream()
|
||||
.anyMatch(this::containsString);
|
||||
}
|
||||
|
||||
|
||||
@ -315,7 +331,8 @@ public interface SemanticNode {
|
||||
*/
|
||||
default boolean containsAnyStringIgnoreCase(List<String> strings) {
|
||||
|
||||
return strings.stream().anyMatch(this::containsStringIgnoreCase);
|
||||
return strings.stream()
|
||||
.anyMatch(this::containsStringIgnoreCase);
|
||||
}
|
||||
|
||||
|
||||
@ -323,19 +340,19 @@ public interface SemanticNode {
|
||||
* This function is used during insertion of EntityNodes into the graph, it checks if the boundary of the RedactionEntity intersects or even contains the RedactionEntity.
|
||||
* It sets the fields accordingly and recursively calls this function on all its children.
|
||||
*
|
||||
* @param redactionEntity RedactionEntity, which is being inserted into the graph
|
||||
* @param textEntity RedactionEntity, which is being inserted into the graph
|
||||
*/
|
||||
default void addThisToEntityIfIntersects(RedactionEntity redactionEntity) {
|
||||
default void addThisToEntityIfIntersects(TextEntity textEntity) {
|
||||
|
||||
TextBlock textBlock = getTextBlock();
|
||||
if (textBlock.getBoundary().intersects(redactionEntity.getBoundary())) {
|
||||
if (textBlock.containsBoundary(redactionEntity.getBoundary())) {
|
||||
redactionEntity.setDeepestFullyContainingNode(this);
|
||||
if (textBlock.getTextRange().intersects(textEntity.getTextRange())) {
|
||||
if (textBlock.containsBoundary(textEntity.getTextRange())) {
|
||||
textEntity.setDeepestFullyContainingNode(this);
|
||||
}
|
||||
|
||||
redactionEntity.addIntersectingNode(this);
|
||||
streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(redactionEntity.getBoundary()))
|
||||
.forEach(node -> node.addThisToEntityIfIntersects(redactionEntity));
|
||||
textEntity.addIntersectingNode(this);
|
||||
streamChildren().filter(semanticNode -> semanticNode.getBoundary().intersects(textEntity.getTextRange()))
|
||||
.forEach(node -> node.addThisToEntityIfIntersects(textEntity));
|
||||
}
|
||||
}
|
||||
|
||||
@ -386,7 +403,8 @@ public interface SemanticNode {
|
||||
*/
|
||||
default Stream<SemanticNode> streamAllSubNodes() {
|
||||
|
||||
return getDocumentTree().allSubEntriesInOrder(getTreeId()).map(DocumentTree.Entry::getNode);
|
||||
return getDocumentTree().allSubEntriesInOrder(getTreeId())
|
||||
.map(DocumentTree.Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
@ -397,7 +415,9 @@ public interface SemanticNode {
|
||||
*/
|
||||
default Stream<SemanticNode> streamAllSubNodesOfType(NodeType nodeType) {
|
||||
|
||||
return getDocumentTree().allSubEntriesInOrder(getTreeId()).filter(entry -> entry.getType().equals(nodeType)).map(DocumentTree.Entry::getNode);
|
||||
return getDocumentTree().allSubEntriesInOrder(getTreeId())
|
||||
.filter(entry -> entry.getType().equals(nodeType))
|
||||
.map(DocumentTree.Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
@ -406,9 +426,9 @@ public interface SemanticNode {
|
||||
*
|
||||
* @return Boundary of this Node's TextBlock
|
||||
*/
|
||||
default Boundary getBoundary() {
|
||||
default TextRange getBoundary() {
|
||||
|
||||
return getTextBlock().getBoundary();
|
||||
return getTextBlock().getTextRange();
|
||||
}
|
||||
|
||||
|
||||
@ -453,17 +473,19 @@ public interface SemanticNode {
|
||||
*/
|
||||
private Map<Page, Rectangle2D> getBBoxFromChildren() {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
List<Map<Page, Rectangle2D>> childrenBBoxes = streamChildren().map(SemanticNode::getBBox).toList();
|
||||
Set<Page> pages = childrenBBoxes.stream().flatMap(map -> map.keySet().stream()).collect(Collectors.toSet());
|
||||
for (Page page : pages) {
|
||||
Rectangle2D bBoxOnPage = childrenBBoxes.stream()
|
||||
.filter(childBboxPerPage -> childBboxPerPage.containsKey(page))
|
||||
.map(childBboxPerPage -> childBboxPerPage.get(page))
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
bBoxPerPage.put(page, bBoxOnPage);
|
||||
List<Map<Page, Rectangle2D>> childrenBBoxes = streamChildren().filter(child -> !isFullPageImage(child))
|
||||
.map(SemanticNode::getBBox)
|
||||
.toList();
|
||||
return BBoxMergingUtility.mergeBBoxes(childrenBBoxes);
|
||||
}
|
||||
|
||||
|
||||
private static boolean isFullPageImage(SemanticNode child) {
|
||||
|
||||
if (!child.getType().equals(NodeType.IMAGE)) {
|
||||
return false;
|
||||
}
|
||||
return bBoxPerPage;
|
||||
return ((Image) child).isFullPageImage();
|
||||
}
|
||||
|
||||
|
||||
@ -473,7 +495,9 @@ public interface SemanticNode {
|
||||
private Map<Page, Rectangle2D> getBBoxFromLeafTextBlock() {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks().stream().collect(Collectors.groupingBy(AtomicTextBlock::getPage));
|
||||
Map<Page, List<AtomicTextBlock>> atomicTextBlockPerPage = getTextBlock().getAtomicTextBlocks()
|
||||
.stream()
|
||||
.collect(Collectors.groupingBy(AtomicTextBlock::getPage));
|
||||
atomicTextBlockPerPage.forEach((page, atbs) -> bBoxPerPage.put(page, RectangleTransformations.bBoxUnionAtomicTextBlock(atbs)));
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
@ -15,7 +15,7 @@ import java.util.stream.Stream;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.TextEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlockCollector;
|
||||
|
||||
@ -43,7 +43,7 @@ public class Table implements SemanticNode {
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<RedactionEntity> entities = new HashSet<>();
|
||||
Set<TextEntity> entities = new HashSet<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
Map<Page, Rectangle2D> bBoxCache;
|
||||
@ -54,7 +54,7 @@ public class Table implements SemanticNode {
|
||||
* @param strings Strings to check whether a row contains them
|
||||
* @return Stream of all entities in this table, that appear in a row, which contains any of the provided strings
|
||||
*/
|
||||
public Stream<RedactionEntity> streamEntitiesWhereRowContainsStringsIgnoreCase(List<String> strings) {
|
||||
public Stream<TextEntity> streamEntitiesWhereRowContainsStringsIgnoreCase(List<String> strings) {
|
||||
|
||||
return IntStream.range(0, numberOfRows).boxed()
|
||||
.filter(row -> rowContainsStringsIgnoreCase(row, strings))
|
||||
@ -88,7 +88,7 @@ public class Table implements SemanticNode {
|
||||
* @param value the string which the table cell should contain
|
||||
* @return a stream of all entities, which appear in a row where at least one cell has the provided header and the provided value.
|
||||
*/
|
||||
public Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndValue(String header, String value) {
|
||||
public Stream<TextEntity> streamEntitiesWhereRowHasHeaderAndValue(String header, String value) {
|
||||
|
||||
List<Integer> vertebrateStudyCols = streamHeaders().filter(headerNode -> headerNode.containsString(header))
|
||||
.map(TableCell::getCol)
|
||||
@ -107,7 +107,7 @@ public class Table implements SemanticNode {
|
||||
* @param values the strings which the table cell should contain
|
||||
* @return a stream of all entities, which appear in a row where at least one cell has the provided header and any provided value.
|
||||
*/
|
||||
public Stream<RedactionEntity> streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List<String> values) {
|
||||
public Stream<TextEntity> streamEntitiesWhereRowHasHeaderAndAnyValue(String header, List<String> values) {
|
||||
|
||||
List<Integer> colsWithHeader = streamHeaders().filter(headerNode -> headerNode.containsString(header))
|
||||
.map(TableCell::getCol)
|
||||
@ -125,7 +125,7 @@ public class Table implements SemanticNode {
|
||||
* @param types type strings to check whether a row contains an entity like them
|
||||
* @return Stream of all entities in this table, that appear in a row, which contains at least one entity with any of the provided types.
|
||||
*/
|
||||
public Stream<RedactionEntity> streamEntitiesWhereRowContainsEntitiesOfType(List<String> types) {
|
||||
public Stream<TextEntity> streamEntitiesWhereRowContainsEntitiesOfType(List<String> types) {
|
||||
|
||||
List<Integer> rowsWithEntityOfType = IntStream.range(0, numberOfRows).boxed()
|
||||
.filter(rowNumber -> streamEntityTypesInRow(rowNumber).anyMatch(existingType -> types.stream()
|
||||
@ -145,7 +145,7 @@ public class Table implements SemanticNode {
|
||||
* @param types type strings to check whether a row doesn't contain an entity like it
|
||||
* @return Stream of all entities in this table, that appear in a row, which does not contain any entity with any of the provided types.
|
||||
*/
|
||||
public Stream<RedactionEntity> streamEntitiesWhereRowContainsNoEntitiesOfType(List<String> types) {
|
||||
public Stream<TextEntity> streamEntitiesWhereRowContainsNoEntitiesOfType(List<String> types) {
|
||||
|
||||
List<Integer> rowsWithNoEntityOfType = IntStream.range(0, numberOfRows).boxed()
|
||||
.filter(rowNumber -> streamEntityTypesInRow(rowNumber).noneMatch(existingType -> types.stream()
|
||||
@ -163,7 +163,7 @@ public class Table implements SemanticNode {
|
||||
|
||||
return streamRow(rowNumber).map(TableCell::getEntities)
|
||||
.flatMap(Collection::stream)
|
||||
.map(RedactionEntity::getType)
|
||||
.map(TextEntity::getType)
|
||||
.distinct();
|
||||
}
|
||||
|
||||
@ -304,12 +304,12 @@ public class Table implements SemanticNode {
|
||||
* Finds all entities of the provided type, which appear in the same row that the provided entity appears in.
|
||||
*
|
||||
* @param type the type of entities to search for
|
||||
* @param redactionEntity the entity, which appears in the row to search
|
||||
* @param textEntity the entity, which appears in the row to search
|
||||
* @return List of all entities of the provided type, which appear in the same row that the provided entity appears in.
|
||||
*/
|
||||
public List<RedactionEntity> getEntitiesOfTypeInSameRow(String type, RedactionEntity redactionEntity) {
|
||||
public List<TextEntity> getEntitiesOfTypeInSameRow(String type, TextEntity textEntity) {
|
||||
|
||||
return redactionEntity.getIntersectingNodes()
|
||||
return textEntity.getIntersectingNodes()
|
||||
.stream()
|
||||
.filter(node -> node instanceof TableCell)
|
||||
.map(node -> (TableCell) node)
|
||||
|
||||
@ -13,7 +13,7 @@ import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
@ -36,14 +36,14 @@ public class AtomicTextBlock implements TextBlock {
|
||||
Page page;
|
||||
|
||||
//string coordinates
|
||||
Boundary boundary;
|
||||
TextRange textRange;
|
||||
String searchText;
|
||||
@Builder.Default
|
||||
List<Integer> lineBreaks = new ArrayList<>();
|
||||
@Builder.Default
|
||||
List<Boundary> boldTextBoundaries = new ArrayList<>();
|
||||
List<TextRange> boldTextBoundaries = new ArrayList<>();
|
||||
@Builder.Default
|
||||
List<Boundary> italicTextBoundaries = new ArrayList<>();
|
||||
List<TextRange> italicTextBoundaries = new ArrayList<>();
|
||||
String orientation;
|
||||
int textDirection;
|
||||
|
||||
@ -66,8 +66,8 @@ public class AtomicTextBlock implements TextBlock {
|
||||
|
||||
public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText,
|
||||
List<Integer> lineBreaks,
|
||||
List<Boundary> boldTextBoundaries,
|
||||
List<Boundary> italicTextBoundaries,
|
||||
List<TextRange> boldTextBoundaries,
|
||||
List<TextRange> italicTextBoundaries,
|
||||
List<Rectangle2D> positions,
|
||||
List<Integer> stringIdxToPositionIdx,
|
||||
long idx,
|
||||
@ -89,7 +89,7 @@ public class AtomicTextBlock implements TextBlock {
|
||||
.italicTextBoundaries(italicTextBoundaries)
|
||||
.positions(positions)
|
||||
.stringIdxToPositionIdx(stringIdxToPositionIdx)
|
||||
.boundary(new Boundary(offset, offset + searchText.length()))
|
||||
.textRange(new TextRange(offset, offset + searchText.length()))
|
||||
.textDirection(textDirection)
|
||||
.orientation(orientation)
|
||||
.build();
|
||||
@ -100,7 +100,7 @@ public class AtomicTextBlock implements TextBlock {
|
||||
|
||||
return AtomicTextBlock.builder()
|
||||
.id(textBlockIdx)
|
||||
.boundary(new Boundary(stringOffset, stringOffset))
|
||||
.textRange(new TextRange(stringOffset, stringOffset))
|
||||
.searchText("")
|
||||
.page(page)
|
||||
.numberOnPage(numberOnPage)
|
||||
@ -118,7 +118,7 @@ public class AtomicTextBlock implements TextBlock {
|
||||
.id(documentTextData.getId())
|
||||
.numberOnPage(documentTextData.getNumberOnPage())
|
||||
.page(page)
|
||||
.boundary(new Boundary(documentTextData.getStart(), documentTextData.getEnd()))
|
||||
.textRange(new TextRange(documentTextData.getStart(), documentTextData.getEnd()))
|
||||
.searchText(documentTextData.getSearchText())
|
||||
.lineBreaks(Arrays.stream(documentTextData.getLineBreaks()).boxed().toList())
|
||||
.stringIdxToPositionIdx(Arrays.stream(documentPositionData.getStringIdxToPositionIdx()).boxed().toList())
|
||||
@ -140,11 +140,11 @@ public class AtomicTextBlock implements TextBlock {
|
||||
throw new IndexOutOfBoundsException(format("line %d out of range for AtomicTextBlock with %d lines", lineNumber, numberOfLines()));
|
||||
}
|
||||
if (lineNumber == 0) {
|
||||
return subSequence(boundary.start(), lineBreaks.get(0) + boundary.start());
|
||||
return subSequence(textRange.start(), lineBreaks.get(0) + textRange.start());
|
||||
} else if (lineNumber == numberOfLines() - 1) {
|
||||
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + boundary.start(), boundary.end());
|
||||
return subSequence(lineBreaks.get(lineBreaks.size() - 1) + textRange.start(), textRange.end());
|
||||
}
|
||||
return subSequence(lineBreaks.get(lineNumber - 1) + boundary.start(), lineBreaks.get(lineNumber) + boundary.start());
|
||||
return subSequence(lineBreaks.get(lineNumber - 1) + textRange.start(), lineBreaks.get(lineNumber) + textRange.start());
|
||||
}
|
||||
|
||||
|
||||
@ -159,9 +159,9 @@ public class AtomicTextBlock implements TextBlock {
|
||||
public int getNextLinebreak(int fromIndex) {
|
||||
|
||||
return lineBreaks.stream()//
|
||||
.filter(linebreak -> linebreak > fromIndex - boundary.start()) //
|
||||
.filter(linebreak -> linebreak > fromIndex - textRange.start()) //
|
||||
.findFirst() //
|
||||
.orElse(searchText.length()) + boundary.start();
|
||||
.orElse(searchText.length()) + textRange.start();
|
||||
}
|
||||
|
||||
|
||||
@ -169,43 +169,43 @@ public class AtomicTextBlock implements TextBlock {
|
||||
public int getPreviousLinebreak(int fromIndex) {
|
||||
|
||||
return lineBreaks.stream()//
|
||||
.filter(linebreak -> linebreak <= fromIndex - boundary.start())//
|
||||
.filter(linebreak -> linebreak <= fromIndex - textRange.start())//
|
||||
.reduce((a, b) -> b)//
|
||||
.orElse(0) + boundary.start();
|
||||
.orElse(0) + textRange.start();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Rectangle2D getPosition(int stringIdx) {
|
||||
|
||||
return positions.get(stringIdxToPositionIdx.get(stringIdx - boundary.start()));
|
||||
return positions.get(stringIdxToPositionIdx.get(stringIdx - textRange.start()));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
|
||||
public List<Rectangle2D> getPositions(TextRange stringTextRange) {
|
||||
|
||||
if (!containsBoundary(stringBoundary)) {
|
||||
throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringBoundary, this.boundary));
|
||||
if (!containsBoundary(stringTextRange)) {
|
||||
throw new IndexOutOfBoundsException(format("%s is out of bounds for %s", stringTextRange, this.textRange));
|
||||
}
|
||||
if (stringBoundary.length() == 0) {
|
||||
if (stringTextRange.length() == 0) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
int startPositionIdx = stringIdxToPositionIdx.get(stringBoundary.start() - this.boundary.start());
|
||||
int startPositionIdx = stringIdxToPositionIdx.get(stringTextRange.start() - this.textRange.start());
|
||||
|
||||
if (stringBoundary.end() == this.boundary.end()) {
|
||||
if (stringTextRange.end() == this.textRange.end()) {
|
||||
return positions.subList(startPositionIdx, positions.size());
|
||||
}
|
||||
|
||||
return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringBoundary.end() - this.boundary.start()));
|
||||
return positions.subList(startPositionIdx, stringIdxToPositionIdx.get(stringTextRange.end() - this.textRange.start()));
|
||||
|
||||
}
|
||||
|
||||
|
||||
public Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary) {
|
||||
public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {
|
||||
|
||||
List<Rectangle2D> rectanglesPerLine = stringBoundary.split(getAllLineBreaksInBoundary(stringBoundary))
|
||||
List<Rectangle2D> rectanglesPerLine = stringTextRange.split(getAllLineBreaksInBoundary(stringTextRange))
|
||||
.stream()
|
||||
.map(this::getPositions)
|
||||
.map(RectangleTransformations::rectangleBBoxWithGaps)
|
||||
@ -217,9 +217,9 @@ public class AtomicTextBlock implements TextBlock {
|
||||
}
|
||||
|
||||
|
||||
private List<Integer> getAllLineBreaksInBoundary(Boundary boundary) {
|
||||
protected List<Integer> getAllLineBreaksInBoundary(TextRange textRange) {
|
||||
|
||||
return getLineBreaks().stream().map(linebreak -> linebreak + this.boundary.start()).filter(boundary::contains).toList();
|
||||
return getLineBreaks().stream().map(linebreak -> linebreak + this.textRange.start()).filter(textRange::contains).toList();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -11,7 +11,7 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
@ -24,7 +24,7 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
|
||||
List<AtomicTextBlock> atomicTextBlocks;
|
||||
String searchText;
|
||||
Boundary boundary;
|
||||
TextRange textRange;
|
||||
|
||||
|
||||
public static ConcatenatedTextBlock empty() {
|
||||
@ -37,12 +37,12 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
|
||||
this.atomicTextBlocks = new LinkedList<>();
|
||||
if (atomicTextBlocks.isEmpty()) {
|
||||
boundary = new Boundary(-1, -1);
|
||||
textRange = new TextRange(-1, -1);
|
||||
return;
|
||||
}
|
||||
var firstTextBlock = atomicTextBlocks.get(0);
|
||||
this.atomicTextBlocks.add(firstTextBlock);
|
||||
boundary = new Boundary(firstTextBlock.getBoundary().start(), firstTextBlock.getBoundary().end());
|
||||
textRange = new TextRange(firstTextBlock.getTextRange().start(), firstTextBlock.getTextRange().end());
|
||||
|
||||
atomicTextBlocks.subList(1, atomicTextBlocks.size()).forEach(this::concat);
|
||||
}
|
||||
@ -50,16 +50,16 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
|
||||
public ConcatenatedTextBlock concat(TextBlock textBlock) {
|
||||
|
||||
int start = textBlock.getBoundary().start();
|
||||
int end = textBlock.getBoundary().end();
|
||||
int start = textBlock.getTextRange().start();
|
||||
int end = textBlock.getTextRange().end();
|
||||
if (this.atomicTextBlocks.isEmpty()) {
|
||||
boundary.setStart(start);
|
||||
boundary.setEnd(end);
|
||||
} else if (boundary.end() != start) {
|
||||
throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
|
||||
textRange.setStart(start);
|
||||
textRange.setEnd(end);
|
||||
} else if (textRange.end() != start) {
|
||||
throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", textRange, textBlock.getTextRange()));
|
||||
}
|
||||
this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
|
||||
boundary.setEnd(end);
|
||||
textRange.setEnd(end);
|
||||
this.searchText = null;
|
||||
return this;
|
||||
}
|
||||
@ -67,13 +67,13 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
|
||||
private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {
|
||||
|
||||
return atomicTextBlocks.stream().filter(textBlock -> textBlock.getBoundary().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new);
|
||||
return atomicTextBlocks.stream().filter(textBlock -> textBlock.getTextRange().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new);
|
||||
}
|
||||
|
||||
|
||||
private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(Boundary boundary) {
|
||||
private List<AtomicTextBlock> getAllAtomicTextBlocksPartiallyInStringBoundary(TextRange textRange) {
|
||||
|
||||
return atomicTextBlocks.stream().filter(tb -> tb.getBoundary().intersects(boundary)).toList();
|
||||
return atomicTextBlocks.stream().filter(tb -> tb.getTextRange().intersects(textRange)).toList();
|
||||
}
|
||||
|
||||
|
||||
@ -125,47 +125,47 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
|
||||
|
||||
@Override
|
||||
public List<Rectangle2D> getPositions(Boundary stringBoundary) {
|
||||
public List<Rectangle2D> getPositions(TextRange stringTextRange) {
|
||||
|
||||
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
|
||||
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
|
||||
|
||||
if (textBlocks.size() == 1) {
|
||||
return textBlocks.get(0).getPositions(stringBoundary);
|
||||
return textBlocks.get(0).getPositions(stringTextRange);
|
||||
}
|
||||
|
||||
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
||||
List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end())));
|
||||
List<Rectangle2D> positions = new LinkedList<>(firstTextBlock.getPositions(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end())));
|
||||
|
||||
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
||||
positions.addAll(textBlock.getPositions());
|
||||
}
|
||||
|
||||
var lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
||||
positions.addAll(lastTextBlock.getPositions(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));
|
||||
positions.addAll(lastTextBlock.getPositions(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end())));
|
||||
|
||||
return positions;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary) {
|
||||
public Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange) {
|
||||
|
||||
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringBoundary);
|
||||
List<AtomicTextBlock> textBlocks = getAllAtomicTextBlocksPartiallyInStringBoundary(stringTextRange);
|
||||
|
||||
if (textBlocks.size() == 1) {
|
||||
return textBlocks.get(0).getPositionsPerPage(stringBoundary);
|
||||
return textBlocks.get(0).getPositionsPerPage(stringTextRange);
|
||||
}
|
||||
|
||||
AtomicTextBlock firstTextBlock = textBlocks.get(0);
|
||||
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new Boundary(stringBoundary.start(), firstTextBlock.getBoundary().end()));
|
||||
Map<Page, List<Rectangle2D>> rectanglesPerLinePerPage = firstTextBlock.getPositionsPerPage(new TextRange(stringTextRange.start(), firstTextBlock.getTextRange().end()));
|
||||
|
||||
for (AtomicTextBlock textBlock : textBlocks.subList(1, textBlocks.size() - 1)) {
|
||||
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getBoundary()));
|
||||
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage, textBlock.getPositionsPerPage(textBlock.getTextRange()));
|
||||
}
|
||||
|
||||
AtomicTextBlock lastTextBlock = textBlocks.get(textBlocks.size() - 1);
|
||||
rectanglesPerLinePerPage = mergeEntityPositionsWithSamePageNode(rectanglesPerLinePerPage,
|
||||
lastTextBlock.getPositionsPerPage(new Boundary(lastTextBlock.getBoundary().start(), stringBoundary.end())));
|
||||
lastTextBlock.getPositionsPerPage(new TextRange(lastTextBlock.getTextRange().start(), stringTextRange.end())));
|
||||
|
||||
return rectanglesPerLinePerPage;
|
||||
}
|
||||
@ -187,14 +187,14 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
|
||||
|
||||
@Override
|
||||
public List<Boundary> getBoldTextBoundaries() {
|
||||
public List<TextRange> getBoldTextBoundaries() {
|
||||
|
||||
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Boundary> getItalicTextBoundaries() {
|
||||
public List<TextRange> getItalicTextBoundaries() {
|
||||
|
||||
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList();
|
||||
}
|
||||
|
||||
@ -10,7 +10,7 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
|
||||
public interface TextBlock extends CharSequence {
|
||||
@ -21,10 +21,10 @@ public interface TextBlock extends CharSequence {
|
||||
List<AtomicTextBlock> getAtomicTextBlocks();
|
||||
|
||||
|
||||
List<Boundary> getBoldTextBoundaries();
|
||||
List<TextRange> getBoldTextBoundaries();
|
||||
|
||||
|
||||
List<Boundary> getItalicTextBoundaries();
|
||||
List<TextRange> getItalicTextBoundaries();
|
||||
|
||||
|
||||
String getOrientation();
|
||||
@ -33,7 +33,7 @@ public interface TextBlock extends CharSequence {
|
||||
int getTextDirection();
|
||||
|
||||
|
||||
Boundary getBoundary();
|
||||
TextRange getTextRange();
|
||||
|
||||
|
||||
int getNextLinebreak(int fromIndex);
|
||||
@ -48,10 +48,10 @@ public interface TextBlock extends CharSequence {
|
||||
Rectangle2D getPosition(int stringIdx);
|
||||
|
||||
|
||||
List<Rectangle2D> getPositions(Boundary stringBoundary);
|
||||
List<Rectangle2D> getPositions(TextRange stringTextRange);
|
||||
|
||||
|
||||
Map<Page, List<Rectangle2D>> getPositionsPerPage(Boundary stringBoundary);
|
||||
Map<Page, List<Rectangle2D>> getPositionsPerPage(TextRange stringTextRange);
|
||||
|
||||
|
||||
int numberOfLines();
|
||||
@ -59,7 +59,7 @@ public interface TextBlock extends CharSequence {
|
||||
|
||||
default int indexOf(String searchTerm) {
|
||||
|
||||
return indexOf(searchTerm, getBoundary().start());
|
||||
return indexOf(searchTerm, getTextRange().start());
|
||||
}
|
||||
|
||||
|
||||
@ -69,10 +69,10 @@ public interface TextBlock extends CharSequence {
|
||||
}
|
||||
|
||||
|
||||
default Set<Page> getPages(Boundary boundary) {
|
||||
default Set<Page> getPages(TextRange textRange) {
|
||||
|
||||
return getAtomicTextBlocks().stream()
|
||||
.filter(atomicTextBlock -> atomicTextBlock.getBoundary().intersects(boundary))
|
||||
.filter(atomicTextBlock -> atomicTextBlock.getTextRange().intersects(textRange))
|
||||
.map(AtomicTextBlock::getPage)
|
||||
.collect(Collectors.toUnmodifiableSet());
|
||||
}
|
||||
@ -80,38 +80,38 @@ public interface TextBlock extends CharSequence {
|
||||
|
||||
default int indexOf(String searchTerm, int startOffset) {
|
||||
|
||||
int start = getSearchText().indexOf(searchTerm, startOffset - getBoundary().start());
|
||||
int start = getSearchText().indexOf(searchTerm, startOffset - getTextRange().start());
|
||||
if (start == -1) {
|
||||
return -1;
|
||||
}
|
||||
return start + getBoundary().start();
|
||||
return start + getTextRange().start();
|
||||
}
|
||||
|
||||
|
||||
default CharSequence getFirstLine() {
|
||||
|
||||
return subSequence(getBoundary().start(), getNextLinebreak(getBoundary().start()));
|
||||
return subSequence(getTextRange().start(), getNextLinebreak(getTextRange().start()));
|
||||
}
|
||||
|
||||
|
||||
default boolean containsBoundary(Boundary boundary) {
|
||||
default boolean containsBoundary(TextRange textRange) {
|
||||
|
||||
if (boundary.end() < boundary.start()) {
|
||||
throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", boundary));
|
||||
if (textRange.end() < textRange.start()) {
|
||||
throw new IllegalArgumentException(format("Invalid %s, StartIndex must be smaller than EndIndex", textRange));
|
||||
}
|
||||
return getBoundary().contains(boundary);
|
||||
return getTextRange().contains(textRange);
|
||||
}
|
||||
|
||||
|
||||
default boolean containsIndex(int stringIndex) {
|
||||
|
||||
return getBoundary().contains(stringIndex);
|
||||
return getTextRange().contains(stringIndex);
|
||||
}
|
||||
|
||||
|
||||
default CharSequence subSequence(Boundary boundary) {
|
||||
default CharSequence subSequence(TextRange textRange) {
|
||||
|
||||
return subSequence(boundary.start(), boundary.end());
|
||||
return subSequence(textRange.start(), textRange.end());
|
||||
}
|
||||
|
||||
|
||||
@ -128,21 +128,21 @@ public interface TextBlock extends CharSequence {
|
||||
@Override
|
||||
default CharSequence subSequence(int start, int end) {
|
||||
|
||||
return getSearchText().substring(start - getBoundary().start(), end - getBoundary().start());
|
||||
return getSearchText().substring(start - getTextRange().start(), end - getTextRange().start());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
default int length() {
|
||||
|
||||
return getBoundary().length();
|
||||
return getTextRange().length();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
default char charAt(int index) {
|
||||
|
||||
return getSearchText().charAt(index - getBoundary().start());
|
||||
return getSearchText().charAt(index - getTextRange().start());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -12,12 +12,14 @@ import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Service
|
||||
@Slf4j
|
||||
public class OutlineValidationService {
|
||||
|
||||
@Observed(name = "OutlineValidationService", contextualName = "create-toc")
|
||||
public TableOfContents createToC(List<TextPageBlock> headlines) {
|
||||
|
||||
List<TableOfContentItem> mainSections = new ArrayList<>();
|
||||
|
||||
@ -33,7 +33,7 @@ public class BodyTextFrameService {
|
||||
for (ClassificationPage page : classificationDocument.getPages()) {
|
||||
var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
classificationDocument.getVisualizations().addMainBodyVisualization(page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber());
|
||||
classificationDocument.getLayoutDebugLayer().addMainBodyVisualization(page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -17,7 +17,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRul
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@ -35,7 +35,7 @@ public class DocstrumBlockificationService {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions,
|
||||
CleanRulings rulings,
|
||||
boolean xyOrder,
|
||||
LayoutparsingVisualizations visualizations,
|
||||
LayoutDebugLayer visualizations,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
|
||||
CleanRulings usedRulings = rulings.withoutTextRulings();
|
||||
|
||||
@ -1,9 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
@ -11,13 +8,11 @@ import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
|
||||
@SuppressWarnings("all")
|
||||
@Service
|
||||
@ -35,7 +30,7 @@ public class RedactManagerBlockificationService {
|
||||
* @param visualizations
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings, LayoutparsingVisualizations visualizations) {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings, LayoutDebugLayer visualizations) {
|
||||
|
||||
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
|
||||
|
||||
|
||||
@ -22,6 +22,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
|
||||
@ -31,7 +32,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.He
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.AbstractSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
|
||||
@ -55,7 +55,7 @@ public class DocumentGraphFactory {
|
||||
|
||||
Document documentGraph = new Document();
|
||||
|
||||
documentGraph.setVisualizations(document.getVisualizations());
|
||||
documentGraph.setLayoutDebugLayer(document.getLayoutDebugLayer());
|
||||
|
||||
Context context = new Context(documentGraph);
|
||||
|
||||
@ -280,7 +280,8 @@ public class DocumentGraphFactory {
|
||||
return pages.keySet()
|
||||
.stream()
|
||||
.filter(page -> page.getNumber() == pageIndex)
|
||||
.findFirst().orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
|
||||
.findFirst()
|
||||
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -4,7 +4,7 @@ import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Builder;
|
||||
@ -19,8 +19,8 @@ public class SearchTextWithTextPositionDto {
|
||||
String searchText;
|
||||
List<Integer> lineBreaks;
|
||||
List<Integer> stringIdxToPositionIdx;
|
||||
List<Boundary> boldTextBoundaries;
|
||||
List<Boundary> italicTextBoundaries;
|
||||
List<TextRange> boldTextBoundaries;
|
||||
List<TextRange> italicTextBoundaries;
|
||||
List<Rectangle2D> positions;
|
||||
|
||||
|
||||
|
||||
@ -9,7 +9,7 @@ import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Objects;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
@ -118,23 +118,23 @@ public class SearchTextWithTextPositionFactory {
|
||||
}
|
||||
|
||||
|
||||
private static List<Boundary> mergeToBoundaries(List<Integer> integers) {
|
||||
private static List<TextRange> mergeToBoundaries(List<Integer> integers) {
|
||||
|
||||
if (integers.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
List<Boundary> boundaries = new LinkedList<>();
|
||||
List<TextRange> boundaries = new LinkedList<>();
|
||||
int start = integers.get(0);
|
||||
int end = integers.get(0) + 1;
|
||||
for (int current : integers) {
|
||||
if (current > end + 1) {
|
||||
boundaries.add(new Boundary(start, end));
|
||||
boundaries.add(new TextRange(start, end));
|
||||
start = current;
|
||||
}
|
||||
end = current + 1;
|
||||
}
|
||||
if (boundaries.isEmpty()) {
|
||||
boundaries.add(new Boundary(start, end));
|
||||
boundaries.add(new TextRange(start, end));
|
||||
}
|
||||
return boundaries;
|
||||
}
|
||||
|
||||
@ -116,8 +116,8 @@ public class DocumentDataMapper {
|
||||
.page(atomicTextBlock.getPage().getNumber().longValue())
|
||||
.searchText(atomicTextBlock.getSearchText())
|
||||
.numberOnPage(atomicTextBlock.getNumberOnPage())
|
||||
.start(atomicTextBlock.getBoundary().start())
|
||||
.end(atomicTextBlock.getBoundary().end())
|
||||
.start(atomicTextBlock.getTextRange().start())
|
||||
.end(atomicTextBlock.getTextRange().end())
|
||||
.lineBreaks(toPrimitiveIntArray(atomicTextBlock.getLineBreaks()))
|
||||
.build();
|
||||
}
|
||||
|
||||
@ -13,7 +13,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Researc
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
@ -82,15 +82,15 @@ public class TaasDocumentDataMapper {
|
||||
}
|
||||
|
||||
|
||||
private static Range toRange(Boundary boundary) {
|
||||
private static Range toRange(TextRange textRange) {
|
||||
|
||||
return new Range(boundary.start(), boundary.end());
|
||||
return new Range(textRange.start(), textRange.end());
|
||||
}
|
||||
|
||||
|
||||
private static List<Range> toRange(List<Boundary> boundary) {
|
||||
private static List<Range> toRange(List<TextRange> textRange) {
|
||||
|
||||
return boundary.stream().map(TaasDocumentDataMapper::toRange).toList();
|
||||
return textRange.stream().map(TaasDocumentDataMapper::toRange).toList();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,41 +1,17 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.LayoutGrid;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.IViewerDocumentService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutGrid;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.AccessLevel;
|
||||
@ -48,451 +24,41 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
public class LayoutGridService {
|
||||
|
||||
IViewerDocumentService viewerDocumentService;
|
||||
|
||||
static float FONT_SIZE = 10f;
|
||||
static float LINE_WIDTH = 1f;
|
||||
static Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
|
||||
|
||||
static Color INNER_LINES_COLOR = new Color(255, 175, 175);
|
||||
static Color PARAGRAPH_COLOR = new Color(70, 130, 180);
|
||||
|
||||
static Color DUPLICATE_PARAGRAPH_COLOR = new Color(70, 180, 101);
|
||||
static Color TABLE_COLOR = new Color(102, 205, 170);
|
||||
static Color SECTION_COLOR = new Color(50, 50, 50);
|
||||
static Color HEADLINE_COLOR = new Color(162, 56, 56);
|
||||
static Color HEADER_COLOR = new Color(171, 131, 6);
|
||||
static Color IMAGE_COLOR = new Color(253, 63, 146);
|
||||
|
||||
private record RectangleIdentifier(List<Integer> treeId, Integer pageNumber) {
|
||||
|
||||
}
|
||||
|
||||
HashMap<RectangleIdentifier, Rectangle2D> rectangleMap = new HashMap<>();
|
||||
PDFTronViewerDocumentService viewerDocumentService;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
|
||||
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue, boolean writeVisualLayoutParsingGrid) {
|
||||
|
||||
List<Visualizations> allVisualizations;
|
||||
Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false);
|
||||
if (writeVisualLayoutParsingGrid) {
|
||||
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
|
||||
allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll())
|
||||
.toList();
|
||||
LayoutGrid layoutGrid = createLayoutGrid(document);
|
||||
layoutGrid.setVisibleByDefault(layerVisibilityDefaultValue);
|
||||
// Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
|
||||
if (document.getLayoutDebugLayer().isActive()) {
|
||||
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid, document.getLayoutDebugLayer()));
|
||||
} else {
|
||||
allVisualizations = Stream.concat(Stream.of(layoutGrid), document.getVisualizations().streamAll())
|
||||
.toList();
|
||||
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid));
|
||||
|
||||
}
|
||||
|
||||
viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, allVisualizations);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
|
||||
public Visualizations addLayoutGrid(Document document, boolean layerVisibilityDefaultValue, boolean visualParsingGrid) {
|
||||
private LayoutGrid createLayoutGrid(Document document) {
|
||||
|
||||
LayoutGrid layoutGrid = createLayoutGrid(document, visualParsingGrid);
|
||||
|
||||
return Visualizations.builder()
|
||||
.layer(visualParsingGrid ? ContentStreams.KNECON_VISUAL_PARSING : ContentStreams.KNECON_LAYOUT)
|
||||
.visualizationsOnPages(layoutGrid.getVisualizationsPerPages())
|
||||
.layerVisibilityDefaultValue(layerVisibilityDefaultValue)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private LayoutGrid createLayoutGrid(Document document, boolean visualParsingGrid) {
|
||||
|
||||
LayoutGrid layoutGrid = new LayoutGrid(document.getNumberOfPages());
|
||||
LayoutGrid layoutGrid = new LayoutGrid();
|
||||
document.streamAllSubNodes()
|
||||
.filter(node -> (node.getEngines().contains(LayoutEngine.AI) && visualParsingGrid) || (node.getEngines().contains(LayoutEngine.ALGORITHM) && !visualParsingGrid))
|
||||
.peek(layoutGrid::addTreeId)
|
||||
.forEach(semanticNode -> {
|
||||
Color color = switch (semanticNode.getType()) {
|
||||
case PARAGRAPH -> PARAGRAPH_COLOR;
|
||||
case TABLE -> TABLE_COLOR;
|
||||
case SECTION, SUPER_SECTION -> SECTION_COLOR;
|
||||
case HEADLINE -> HEADLINE_COLOR;
|
||||
case HEADER, FOOTER -> HEADER_COLOR;
|
||||
case IMAGE -> IMAGE_COLOR;
|
||||
default -> null;
|
||||
};
|
||||
|
||||
if (semanticNode instanceof DuplicatedParagraph) {
|
||||
color = DUPLICATE_PARAGRAPH_COLOR;
|
||||
}
|
||||
|
||||
if (isNotSectionOrTableCellOrDocument(semanticNode)) {
|
||||
addAsRectangle(semanticNode, layoutGrid, color);
|
||||
}
|
||||
if (semanticNode.getType().equals(NodeType.SECTION) || semanticNode.getType().equals(NodeType.SUPER_SECTION)) {
|
||||
addSection(semanticNode, layoutGrid, color);
|
||||
}
|
||||
if (semanticNode.getType().equals(NodeType.TABLE)) {
|
||||
Table table = (Table) semanticNode;
|
||||
addInnerTableLines(table, layoutGrid);
|
||||
switch (semanticNode.getType()) {
|
||||
case SECTION, SUPER_SECTION -> layoutGrid.addSection(semanticNode);
|
||||
case HEADLINE -> layoutGrid.addHeadline((Headline) semanticNode);
|
||||
case PARAGRAPH -> layoutGrid.addParagraph((Paragraph) semanticNode);
|
||||
case TABLE -> layoutGrid.addTable((Table) semanticNode);
|
||||
case IMAGE -> layoutGrid.addImage((Image) semanticNode);
|
||||
case HEADER, FOOTER -> layoutGrid.addHeaderOrFooter(semanticNode);
|
||||
}
|
||||
});
|
||||
return layoutGrid;
|
||||
}
|
||||
|
||||
|
||||
private void addInnerTableLines(Table table, LayoutGrid layoutGrid) {
|
||||
|
||||
if (table.getNumberOfCols() < 1 || table.getNumberOfRows() < 1) {
|
||||
return;
|
||||
}
|
||||
for (Page page : table.getPages()) {
|
||||
|
||||
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0)
|
||||
.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
|
||||
.map(TableCell::getRow)
|
||||
.findFirst();
|
||||
if (optionalFirstRowOnPage.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
int firstRowOnPage = optionalFirstRowOnPage.get();
|
||||
|
||||
Stream<Double> xStream = switch (page.getRotation()) {
|
||||
case 90 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinX);
|
||||
case 180 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMaxX);
|
||||
case 270 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMaxX);
|
||||
default -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMinX);
|
||||
};
|
||||
List<Double> xs = xStream.collect(Collectors.toList());
|
||||
xs.remove(0);
|
||||
|
||||
Stream<Double> yStream = switch (page.getRotation()) {
|
||||
case 90 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMinY);
|
||||
case 180 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinY);
|
||||
case 270 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMaxY);
|
||||
default -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMaxY);
|
||||
};
|
||||
List<Double> ys = yStream.collect(Collectors.toList());
|
||||
ys.remove(0);
|
||||
|
||||
Rectangle2D tableBBox = table.getBBox()
|
||||
.get(page);
|
||||
List<ColoredLine> coloredLines = layoutGrid.getVisualizationsPerPages()
|
||||
.get(page.getNumber() - 1).getColoredLines();
|
||||
xs.forEach(x -> {
|
||||
Line2D line = new Line2D.Double(new Point2D.Double(x, tableBBox.getMaxY()), new Point2D.Double(x, tableBBox.getMinY()));
|
||||
coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH));
|
||||
});
|
||||
ys.forEach(y -> {
|
||||
Line2D line = new Line2D.Double(new Point2D.Double(tableBBox.getMinX(), y), new Point2D.Double(tableBBox.getMaxX(), y));
|
||||
coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH));
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static Stream<Rectangle2D> streamBBoxOfCellsOnPage(Stream<TableCell> table, Page page) {
|
||||
|
||||
return table.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
|
||||
.map(TableCell::getBBox)
|
||||
.map(bBoxMap -> bBoxMap.get(page));
|
||||
}
|
||||
|
||||
|
||||
private void addSection(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxMap = semanticNode.getBBox();
|
||||
|
||||
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION)
|
||||
.toList();
|
||||
Integer maxChildDepth = subSections.stream()
|
||||
.map(node -> node.getTreeId().size())
|
||||
.max(Integer::compareTo)
|
||||
.orElse(semanticNode.getTreeId().size());
|
||||
int ownDepth = semanticNode.getTreeId().size();
|
||||
|
||||
Page firstPage = semanticNode.getFirstPage();
|
||||
String treeIdString = buildTreeIdString(semanticNode);
|
||||
|
||||
if (bBoxMap.values().size() == 1) {
|
||||
handleSinglePage(semanticNode, layoutGrid, color, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth);
|
||||
return;
|
||||
}
|
||||
List<Page> pagesInOrder = bBoxMap.keySet()
|
||||
.stream()
|
||||
.sorted(Comparator.comparingInt(Page::getNumber))
|
||||
.collect(Collectors.toList());
|
||||
pagesInOrder.remove(0);
|
||||
handleFirstPageOfSection(semanticNode, color, firstPage, bBoxMap.get(firstPage), treeIdString, layoutGrid, maxChildDepth, ownDepth);
|
||||
if (semanticNode instanceof SuperSection) {
|
||||
return;
|
||||
}
|
||||
for (Page middlePage : pagesInOrder.subList(0, pagesInOrder.size() - 1)) {
|
||||
handleForMiddlePageOfSection(semanticNode, color, middlePage, bBoxMap.get(middlePage), treeIdString, layoutGrid, maxChildDepth, ownDepth);
|
||||
}
|
||||
var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1);
|
||||
handleLastPageOfSection(semanticNode, color, lastPage, bBoxMap.get(lastPage), treeIdString, layoutGrid, maxChildDepth, ownDepth);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void addPlacedText(Page page, Rectangle2D textBBox, Rectangle2D highestParentRect, String s, LayoutGrid layoutGrid, Integer maxChildDepth) {
|
||||
|
||||
// translates text, such that its right edge is a bit to the left of the drawn box
|
||||
float translationAmount = ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + LINE_WIDTH + 2 * maxChildDepth);
|
||||
|
||||
Point2D upperLeftCorner;
|
||||
Point2D translationVector;
|
||||
switch (page.getRotation()) {
|
||||
case 90 -> {
|
||||
if (highestParentRect != null) {
|
||||
upperLeftCorner = new Point2D.Double(highestParentRect.getMinX(), textBBox.getMinY());
|
||||
} else {
|
||||
upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMinY());
|
||||
}
|
||||
translationVector = new Point2D.Double(FONT_SIZE, -translationAmount);
|
||||
}
|
||||
case 180 -> {
|
||||
if (highestParentRect != null) {
|
||||
upperLeftCorner = new Point2D.Double(highestParentRect.getMaxX(), textBBox.getMinY());
|
||||
} else {
|
||||
upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMinY());
|
||||
}
|
||||
translationVector = new Point2D.Double(translationAmount, FONT_SIZE);
|
||||
}
|
||||
case 270 -> {
|
||||
|
||||
if (highestParentRect != null) {
|
||||
upperLeftCorner = new Point2D.Double(highestParentRect.getMaxX(), textBBox.getMaxY());
|
||||
} else {
|
||||
upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMaxY());
|
||||
}
|
||||
translationVector = new Point2D.Double(-FONT_SIZE, translationAmount);
|
||||
}
|
||||
default -> {
|
||||
|
||||
if (highestParentRect != null) {
|
||||
upperLeftCorner = new Point2D.Double(highestParentRect.getMinX(), textBBox.getMaxY());
|
||||
} else {
|
||||
upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMaxY());
|
||||
}
|
||||
translationVector = new Point2D.Double(-translationAmount, -FONT_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
upperLeftCorner = add(upperLeftCorner, translationVector);
|
||||
|
||||
List<PlacedText> placedTexts = layoutGrid.getVisualizationsPerPages()
|
||||
.get(page.getNumber() - 1).getPlacedTexts();
|
||||
|
||||
PlacedText newText = PlacedText.textFacingUp(s, upperLeftCorner, FONT_SIZE, Color.BLACK, FONT);
|
||||
|
||||
Optional<PlacedText> conflictingText = placedTexts.stream()
|
||||
.filter(pt -> Math.abs(pt.lineStart().getY() - newText.lineStart().getY()) <= FONT_SIZE)
|
||||
.findFirst();
|
||||
|
||||
if (conflictingText.isPresent()) {
|
||||
PlacedText existingText = conflictingText.get();
|
||||
if (newText.text().length() > existingText.text().length()) {
|
||||
placedTexts.remove(existingText);
|
||||
placedTexts.add(newText);
|
||||
}
|
||||
} else {
|
||||
placedTexts.add(newText);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void handleSinglePage(SemanticNode semanticNode,
|
||||
LayoutGrid layoutGrid,
|
||||
Color color,
|
||||
Page page,
|
||||
Rectangle2D rectangle2D,
|
||||
String treeIdString,
|
||||
Integer maxChildDepth,
|
||||
Integer ownDepth) {
|
||||
|
||||
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, page, rectangle2D, treeIdString, layoutGrid, maxChildDepth, ownDepth);
|
||||
// add string to top line
|
||||
var firstLine = result.pageLines().remove(0);
|
||||
result.coloredLines().add(new ColoredLine(firstLine, color, LINE_WIDTH));
|
||||
for (Line2D line : result.pageLines()) {
|
||||
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void handleFirstPageOfSection(SemanticNode semanticNode,
|
||||
Color color,
|
||||
Page firstPage,
|
||||
Rectangle2D rectangle2D,
|
||||
String treeIdString,
|
||||
LayoutGrid layoutGrid,
|
||||
Integer maxChildDepth,
|
||||
Integer ownDepth) {
|
||||
|
||||
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, firstPage, rectangle2D, treeIdString, layoutGrid, maxChildDepth, ownDepth);
|
||||
// remove bottom line
|
||||
result.pageLines().remove(2);
|
||||
// add string to top line
|
||||
var firstLine = result.pageLines().remove(0);
|
||||
result.coloredLines().add(new ColoredLine(firstLine, color, LINE_WIDTH));
|
||||
for (Line2D line : result.pageLines()) {
|
||||
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void handleForMiddlePageOfSection(SemanticNode semanticNode,
|
||||
Color color,
|
||||
Page middlePage,
|
||||
Rectangle2D rectangle2D,
|
||||
String treeIdString,
|
||||
LayoutGrid layoutGrid,
|
||||
Integer maxChildDepth,
|
||||
Integer ownDepth) {
|
||||
|
||||
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, middlePage, rectangle2D, treeIdString, layoutGrid, maxChildDepth, ownDepth);
|
||||
// remove top line
|
||||
result.pageLines().remove(0);
|
||||
// remove bottom line
|
||||
result.pageLines().remove(1);
|
||||
// add string to left line
|
||||
var leftLine = result.pageLines().remove(1);
|
||||
result.coloredLines().add(new ColoredLine(leftLine, color, LINE_WIDTH));
|
||||
for (Line2D line : result.pageLines()) {
|
||||
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void handleLastPageOfSection(SemanticNode semanticNode,
|
||||
Color color,
|
||||
Page lastPage,
|
||||
Rectangle2D rectangle2D,
|
||||
String treeIdString,
|
||||
LayoutGrid layoutGrid,
|
||||
Integer maxChildDepth,
|
||||
Integer ownDepth) {
|
||||
|
||||
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, lastPage, rectangle2D, treeIdString, layoutGrid, maxChildDepth, ownDepth);
|
||||
// remove top line
|
||||
result.pageLines().remove(0);
|
||||
// add string to left line
|
||||
var leftLine = result.pageLines().remove(2);
|
||||
result.coloredLines().add(new ColoredLine(leftLine, color, LINE_WIDTH));
|
||||
for (Line2D line : result.pageLines()) {
|
||||
result.coloredLines().add(new ColoredLine(line, color, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private RectangleAndLinesResult createLinesAndPlaceText(SemanticNode semanticNode,
|
||||
Page page,
|
||||
Rectangle2D rectangle2D,
|
||||
String treeIdString,
|
||||
LayoutGrid layoutGrid,
|
||||
Integer maxChildDepth,
|
||||
Integer ownDepth) {
|
||||
|
||||
List<ColoredLine> coloredLines = layoutGrid.getVisualizationsPerPages()
|
||||
.get(page.getNumber() - 1).getColoredLines();
|
||||
int lineWidthModifier = maxChildDepth - ownDepth;
|
||||
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox()
|
||||
.get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier));
|
||||
var lastPageLines = createLinesFromRectangle(r, page.getRotation());
|
||||
|
||||
SemanticNode highestParent = semanticNode.getHighestParent();
|
||||
Rectangle2D highestParentRect = rectangleMap.get(new RectangleIdentifier(highestParent.getTreeId(), page.getNumber()));
|
||||
addPlacedText(page, rectangle2D, highestParentRect, treeIdString, layoutGrid, maxChildDepth);
|
||||
|
||||
if (semanticNode instanceof SuperSection) {
|
||||
rectangleMap.put(new RectangleIdentifier(semanticNode.getTreeId(), page.getNumber()), r);
|
||||
}
|
||||
|
||||
return new RectangleAndLinesResult(coloredLines, r, lastPageLines);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Bundles the shared per-page colored-line list, the padded section rectangle
 * and the frame edge lines computed for one page.
 */
private record RectangleAndLinesResult(List<ColoredLine> coloredLines, Rectangle2D rectangle, List<Line2D> pageLines) {

}
|
||||
|
||||
|
||||
private String buildTreeIdString(SemanticNode semanticNode) {
|
||||
|
||||
return semanticNode.getTreeId()
|
||||
.stream()
|
||||
.map(Object::toString)
|
||||
.collect(Collectors.joining("."));
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
A __________________ B
|
||||
| |
|
||||
| |
|
||||
| |
|
||||
| |
|
||||
D|__________________| C
|
||||
The returned List are the lines [AB, BC, DC, AD]
|
||||
The List is reordered, such that the order of the returned lines are always as viewed on the page.
|
||||
*/
|
||||
private List<Line2D> createLinesFromRectangle(Rectangle2D r, int pageRotation) {
|
||||
// +0.5 to join the lines
|
||||
List<Line2D> lines = new ArrayList<>(4);
|
||||
float lineWidthCorrection = LINE_WIDTH * 0.5f;
|
||||
Point2D.Float a = new Point2D.Float((float) r.getMinX(), (float) r.getMaxY());
|
||||
Point2D.Float a1 = new Point2D.Float((float) r.getMinX() - lineWidthCorrection, (float) r.getMaxY());
|
||||
Point2D.Float b = new Point2D.Float((float) r.getMaxX(), (float) r.getMaxY());
|
||||
Point2D.Float b1 = new Point2D.Float((float) r.getMaxX() + lineWidthCorrection, (float) r.getMaxY());
|
||||
Point2D.Float c = new Point2D.Float((float) r.getMaxX(), (float) r.getMinY());
|
||||
Point2D.Float c1 = new Point2D.Float((float) r.getMaxX() + lineWidthCorrection, (float) r.getMinY());
|
||||
Point2D.Float d = new Point2D.Float((float) r.getMinX(), (float) r.getMinY());
|
||||
Point2D.Float d1 = new Point2D.Float((float) r.getMinX() - lineWidthCorrection, (float) r.getMinY());
|
||||
lines.add(new Line2D.Float(a1, b1));
|
||||
lines.add(new Line2D.Float(b, c));
|
||||
lines.add(new Line2D.Float(d1, c1));
|
||||
lines.add(new Line2D.Float(a, d));
|
||||
|
||||
return switch (pageRotation) {
|
||||
case 90 -> {
|
||||
Collections.rotate(lines, 1);
|
||||
yield lines;
|
||||
}
|
||||
case 180 -> {
|
||||
Collections.rotate(lines, 2);
|
||||
yield lines;
|
||||
}
|
||||
case 270 -> {
|
||||
Collections.rotate(lines, 3);
|
||||
yield lines;
|
||||
}
|
||||
|
||||
default -> lines;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
private static boolean isNotSectionOrTableCellOrDocument(SemanticNode semanticNode) {
|
||||
|
||||
return !(semanticNode.getType().equals(NodeType.DOCUMENT)
|
||||
|| semanticNode.getType().equals(NodeType.SECTION)
|
||||
|| semanticNode.getType().equals(NodeType.SUPER_SECTION)
|
||||
|| semanticNode.getType().equals(NodeType.TABLE_CELL));
|
||||
}
|
||||
|
||||
|
||||
private void addAsRectangle(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) {
|
||||
|
||||
semanticNode.getBBox()
|
||||
.forEach((page, textBBox) -> layoutGrid.getVisualizationsPerPages()
|
||||
.get(page.getNumber() - 1).getColoredRectangles().add(new ColoredRectangle(textBBox, color, LINE_WIDTH)));
|
||||
}
|
||||
|
||||
|
||||
private Point2D add(Point2D a, Point2D b) {
|
||||
|
||||
return new Point2D.Double(a.getX() + b.getX(), a.getY() + b.getY());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,34 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class BBoxMergingUtility {
|
||||
|
||||
public Map<Page, Rectangle2D> mergeBBoxes(List<Map<Page, Rectangle2D>> bboxesToMerge) {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
Set<Page> pages = bboxesToMerge.stream()
|
||||
.flatMap(map -> map.keySet()
|
||||
.stream())
|
||||
.collect(Collectors.toSet());
|
||||
for (Page page : pages) {
|
||||
Rectangle2D bBoxOnPage = bboxesToMerge.stream()
|
||||
.filter(childBboxPerPage -> childBboxPerPage.containsKey(page))
|
||||
.map(childBboxPerPage -> childBboxPerPage.get(page))
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
bBoxPerPage.put(page, bBoxOnPage);
|
||||
}
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,111 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.visualization;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ConnectionLineUtil {
|
||||
|
||||
public static Line2D[] splitRectangleIntoLines(Rectangle2D rect) {
|
||||
|
||||
double x = rect.getX();
|
||||
double y = rect.getY();
|
||||
double width = rect.getWidth();
|
||||
double height = rect.getHeight();
|
||||
|
||||
Line2D[] lines = new Line2D[4];
|
||||
lines[0] = new Line2D.Double(x, y, x + width, y); // Top
|
||||
lines[1] = new Line2D.Double(x + width, y, x + width, y + height); // Right
|
||||
lines[2] = new Line2D.Double(x + width, y + height, x, y + height); // Bottom
|
||||
lines[3] = new Line2D.Double(x, y + height, x, y); // Left
|
||||
|
||||
return lines;
|
||||
}
|
||||
|
||||
|
||||
public static Line2D transform(Line2D line2D, AffineTransform affineTransform) {
|
||||
|
||||
var p1 = affineTransform.transform(line2D.getP1(), null);
|
||||
var p2 = affineTransform.transform(line2D.getP2(), null);
|
||||
return new Line2D.Double(p1, p2);
|
||||
}
|
||||
|
||||
|
||||
public static double length(Line2D line2D) {
|
||||
|
||||
return line2D.getP1().distance(line2D.getP2());
|
||||
}
|
||||
|
||||
|
||||
public static Line2D findClosestMidpointLine(Rectangle2D rect1, Rectangle2D rect2) {
|
||||
|
||||
Line2D[] lines1 = splitRectangleIntoLines(rect1);
|
||||
Line2D[] lines2 = splitRectangleIntoLines(rect2);
|
||||
|
||||
Line2D closestLine1 = null;
|
||||
Line2D closestLine2 = null;
|
||||
double minDistance = Double.MAX_VALUE;
|
||||
|
||||
for (Line2D line1 : lines1) {
|
||||
for (Line2D line2 : lines2) {
|
||||
double distance = lineDistance(line1, line2);
|
||||
if (distance < minDistance) {
|
||||
minDistance = distance;
|
||||
closestLine1 = line1;
|
||||
closestLine2 = line2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (closestLine1 == null || closestLine2 == null) {
|
||||
throw new IllegalStateException("Could not find closest lines");
|
||||
}
|
||||
|
||||
Point2D midpoint1 = getMidpoint(closestLine1);
|
||||
Point2D midpoint2 = getMidpoint(closestLine2);
|
||||
|
||||
return new Line2D.Double(midpoint1, midpoint2);
|
||||
}
|
||||
|
||||
|
||||
private static double lineDistance(Line2D line1, Line2D line2) {
|
||||
|
||||
return Math.abs(getMidpoint(line1).distance(getMidpoint(line2)));
|
||||
}
|
||||
|
||||
|
||||
private static Point2D getMidpoint(Line2D line) {
|
||||
|
||||
double x = (line.getX1() + line.getX2()) / 2;
|
||||
double y = (line.getY1() + line.getY2()) / 2;
|
||||
return new Point2D.Double(x, y);
|
||||
}
|
||||
|
||||
|
||||
public static Line2D[] createArrowHead(Line2D line, double arrowLength) {
|
||||
|
||||
Point2D start = line.getP1();
|
||||
Point2D end = line.getP2();
|
||||
|
||||
// Calculate the angle of the line
|
||||
double angle = Math.atan2(end.getY() - start.getY(), end.getX() - start.getX());
|
||||
|
||||
// Calculate the points for the two arrow lines
|
||||
double arrowHeadAngle = Math.PI / 6;
|
||||
double x1 = end.getX() - arrowLength * Math.cos(angle - arrowHeadAngle);
|
||||
double y1 = end.getY() - arrowLength * Math.sin(angle - arrowHeadAngle);
|
||||
double x2 = end.getX() - arrowLength * Math.cos(angle + arrowHeadAngle);
|
||||
double y2 = end.getY() - arrowLength * Math.sin(angle + arrowHeadAngle);
|
||||
|
||||
// Create and return the two arrow lines
|
||||
Line2D arrow1 = new Line2D.Double(end, new Point2D.Double(x1, y1));
|
||||
Line2D arrow2 = new Line2D.Double(end, new Point2D.Double(x2, y2));
|
||||
|
||||
return new Line2D[]{arrow1, arrow2};
|
||||
}
|
||||
|
||||
}
|
||||
@ -7,7 +7,6 @@ import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
|
||||
@ -21,12 +20,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.LayoutDebugLayerConfig;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
@ -36,72 +33,15 @@ import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Setter
|
||||
@Getter
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class LayoutparsingVisualizations {
|
||||
public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
|
||||
static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
|
||||
|
||||
static final Color WORDS_COLOR = new Color(68, 84, 147);
|
||||
static final Color LINES_COLOR = new Color(152, 45, 179);
|
||||
static final Color ZONES_COLOR = new Color(131, 38, 38);
|
||||
|
||||
static final Color RULINGS_COLOR = new Color(21, 221, 174);
|
||||
static final Color TABLE_RULINGS_COLOR = new Color(255, 175, 175);
|
||||
static final Color HEADER_RULING_COLOR = new Color(171, 131, 6);
|
||||
static final Color FOOTER_RULING_COLOR = new Color(106, 82, 2);
|
||||
static final Color UNDERLINE_RULING_COLOR = new Color(6, 39, 171);
|
||||
static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6);
|
||||
|
||||
static final Color CELLS_COLOR = new Color(31, 214, 27);
|
||||
|
||||
static final Color MAIN_BODY_COLOR = new Color(171, 131, 6);
|
||||
static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6);
|
||||
|
||||
static final List<Color> ROTATING_CHARACTER_COLOR = List.of(new Color(255, 87, 51),
|
||||
new Color(255, 195, 0),
|
||||
new Color(76, 175, 80),
|
||||
new Color(33, 150, 243),
|
||||
new Color(155, 89, 182),
|
||||
new Color(233, 30, 99),
|
||||
new Color(0, 188, 212),
|
||||
new Color(121, 85, 72));
|
||||
|
||||
@Setter
|
||||
boolean active;
|
||||
|
||||
final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build();
|
||||
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
|
||||
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build();
|
||||
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
|
||||
final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build();
|
||||
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
|
||||
final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build();
|
||||
final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build();
|
||||
final Visualizations neighbours = Visualizations.builder().layer(ContentStreams.NEIGHBOURS).build();
|
||||
final Visualizations characters = Visualizations.builder().layer(ContentStreams.CHARACTERS).build();
|
||||
|
||||
|
||||
public Stream<Visualizations> streamAll() {
|
||||
|
||||
if (!active) {
|
||||
return Stream.empty();
|
||||
}
|
||||
return Stream.of(characters, //
|
||||
neighbours,//
|
||||
words, //
|
||||
lines, //
|
||||
zones, //
|
||||
rulings, //
|
||||
clean_rulings, //
|
||||
cells, //
|
||||
mainBody, //
|
||||
markedContent //
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
public void addTextVisualizations(List<TextPositionSequence> textPositionSequences, int pageNumber) {
|
||||
|
||||
@ -130,6 +70,7 @@ public class LayoutparsingVisualizations {
|
||||
.toList());
|
||||
}
|
||||
|
||||
|
||||
public void addRulingVisualization(List<Ruling> rulings, int pageNumber) {
|
||||
|
||||
if (!active) {
|
||||
@ -137,8 +78,7 @@ public class LayoutparsingVisualizations {
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
|
||||
visualizationsOnPage.getColoredLines()
|
||||
.addAll(rulings
|
||||
.stream()
|
||||
.addAll(rulings.stream()
|
||||
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
|
||||
.toList());
|
||||
}
|
||||
@ -295,16 +235,4 @@ public class LayoutparsingVisualizations {
|
||||
|
||||
}
|
||||
|
||||
|
||||
private VisualizationsOnPage getOrCreateVisualizationsOnPage(int page, Visualizations visualizations) {
|
||||
|
||||
if (visualizations.getVisualizationsOnPages().containsKey(page - 1)) {
|
||||
return visualizations.getVisualizationsOnPages()
|
||||
.get(page - 1);
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = VisualizationsOnPage.builder().build();
|
||||
visualizations.getVisualizationsOnPages().put(page - 1, visualizationsOnPage);
|
||||
return visualizationsOnPage;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,430 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.visualization;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.DuplicatedParagraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Headline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.LayoutGridLayerConfig;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class LayoutGrid extends LayoutGridLayerConfig {
|
||||
|
||||
@Getter
|
||||
@Setter
|
||||
boolean visibleByDefault;
|
||||
|
||||
final HashMap<RectangleIdentifier, Rectangle2D> rectangleMap = new HashMap<>();
|
||||
|
||||
|
||||
public void addParagraph(Paragraph paragraph) {
|
||||
|
||||
if (paragraph instanceof DuplicatedParagraph) {
|
||||
addAsRectangle(paragraph, paragraphs, DUPLICATE_PARAGRAPH_COLOR);
|
||||
} else {
|
||||
addAsRectangle(paragraph, paragraphs, PARAGRAPH_COLOR);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void addImage(Image image) {
|
||||
|
||||
if (image.isFullPageImage()) {
|
||||
addAsRectangle(image, images, IMAGE_COLOR);
|
||||
} else {
|
||||
addAsRectangle(image, figures, IMAGE_COLOR);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Adds the headline's bounding boxes to the headline layer. */
public void addHeadline(Headline headline) {

    addAsRectangle(headline, headlines, HEADLINE_COLOR);
}
|
||||
|
||||
|
||||
/** Adds a header or footer node's bounding boxes to the shared header/footer layer. */
public void addHeaderOrFooter(SemanticNode header) {

    addAsRectangle(header, headerFooter, HEADER_COLOR);
}
|
||||
|
||||
|
||||
public void addTreeId(SemanticNode semanticNode) {
|
||||
|
||||
Page page = semanticNode.getFirstPage();
|
||||
addPlacedText(page, semanticNode.getBBox().get(page), semanticNode.getBBox().get(page), buildTreeIdString(semanticNode), 1, treeIds, TREEID_COLOR);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Adds a table visualization: the outer bounding box, the inner row/column
 * separator lines and the filled header-cell rectangles.
 */
public void addTable(Table table) {

    addAsRectangle(table, tables, TABLE_COLOR);
    addInnerTableLines(table);
    addHeaderCells(table);
}
|
||||
|
||||
|
||||
private void addHeaderCells(Table table) {
|
||||
|
||||
table.streamHeaders()
|
||||
.map(TableCell::getBBox)
|
||||
.forEach(map -> map.forEach((page, textBBox) -> getOrCreateVisualizationsOnPage(page.getNumber(), tables).getFilledRectangles()
|
||||
.add(new FilledRectangle(textBBox, HEADER_CELL_COLOR, 0.1f))));
|
||||
}
|
||||
|
||||
|
||||
public void addSection(SemanticNode section) {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxMap = section.getBBox();
|
||||
|
||||
List<SemanticNode> subSections = section.streamAllSubNodesOfType(NodeType.SECTION)
|
||||
.toList();
|
||||
Integer maxChildDepth = subSections.stream()
|
||||
.map(node -> node.getTreeId().size())
|
||||
.max(Integer::compareTo).orElse(section.getTreeId().size());
|
||||
int ownDepth = section.getTreeId().size();
|
||||
|
||||
Page firstPage = section.getFirstPage();
|
||||
String treeIdString = buildTreeIdString(section);
|
||||
|
||||
if (bBoxMap.values().size() == 1) {
|
||||
handleSinglePage(section, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth);
|
||||
return;
|
||||
}
|
||||
List<Page> pagesInOrder = bBoxMap.keySet()
|
||||
.stream()
|
||||
.sorted(Comparator.comparingInt(Page::getNumber))
|
||||
.collect(Collectors.toList());
|
||||
pagesInOrder.remove(0);
|
||||
handleFirstPageOfSection(section, firstPage, bBoxMap.get(firstPage), treeIdString, maxChildDepth, ownDepth);
|
||||
if (section instanceof SuperSection) {
|
||||
return;
|
||||
}
|
||||
for (Page middlePage : pagesInOrder.subList(0, pagesInOrder.size() - 1)) {
|
||||
handleForMiddlePageOfSection(section, middlePage, bBoxMap.get(middlePage), treeIdString, maxChildDepth, ownDepth);
|
||||
}
|
||||
var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1);
|
||||
handleLastPageOfSection(section, lastPage, bBoxMap.get(lastPage), treeIdString, maxChildDepth, ownDepth);
|
||||
}
|
||||
|
||||
|
||||
private String buildTreeIdString(SemanticNode semanticNode) {
|
||||
|
||||
return semanticNode.getTreeId()
|
||||
.stream()
|
||||
.map(Object::toString)
|
||||
.collect(Collectors.joining("."));
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void addPlacedText(Page page, Rectangle2D textBBox, Rectangle2D highestParentRect, String s, Integer maxChildDepth, Visualizations visualizations, Color color) {
|
||||
|
||||
// translates text, such that its right edge is a bit to the left of the drawn box
|
||||
float translationAmount = ((FONT.getStringWidth(s) / 1000) * FONT_SIZE + LINE_WIDTH + 2 * maxChildDepth);
|
||||
|
||||
Point2D upperLeftCorner;
|
||||
Point2D translationVector;
|
||||
switch (page.getRotation()) {
|
||||
case 90 -> {
|
||||
if (highestParentRect != null) {
|
||||
upperLeftCorner = new Point2D.Double(highestParentRect.getMinX(), textBBox.getMinY());
|
||||
} else {
|
||||
upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMinY());
|
||||
}
|
||||
translationVector = new Point2D.Double(FONT_SIZE, -translationAmount);
|
||||
}
|
||||
case 180 -> {
|
||||
if (highestParentRect != null) {
|
||||
upperLeftCorner = new Point2D.Double(highestParentRect.getMaxX(), textBBox.getMinY());
|
||||
} else {
|
||||
upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMinY());
|
||||
}
|
||||
translationVector = new Point2D.Double(translationAmount, FONT_SIZE);
|
||||
}
|
||||
case 270 -> {
|
||||
|
||||
if (highestParentRect != null) {
|
||||
upperLeftCorner = new Point2D.Double(highestParentRect.getMaxX(), textBBox.getMaxY());
|
||||
} else {
|
||||
upperLeftCorner = new Point2D.Double(textBBox.getMaxX(), textBBox.getMaxY());
|
||||
}
|
||||
translationVector = new Point2D.Double(-FONT_SIZE, translationAmount);
|
||||
}
|
||||
default -> {
|
||||
|
||||
if (highestParentRect != null) {
|
||||
upperLeftCorner = new Point2D.Double(highestParentRect.getMinX(), textBBox.getMaxY());
|
||||
} else {
|
||||
upperLeftCorner = new Point2D.Double(textBBox.getMinX(), textBBox.getMaxY());
|
||||
}
|
||||
translationVector = new Point2D.Double(-translationAmount, -FONT_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
upperLeftCorner = add(upperLeftCorner, translationVector);
|
||||
|
||||
List<PlacedText> placedTexts = getOrCreateVisualizationsOnPage(page.getNumber(), visualizations).getPlacedTexts();
|
||||
|
||||
PlacedText newText = PlacedText.textFacingUp(s, upperLeftCorner, FONT_SIZE, color, FONT);
|
||||
|
||||
Optional<PlacedText> conflictingText = placedTexts.stream()
|
||||
.filter(pt -> Math.abs(pt.lineStart().getY() - newText.lineStart().getY()) <= FONT_SIZE)
|
||||
.findFirst();
|
||||
|
||||
if (conflictingText.isPresent()) {
|
||||
PlacedText existingText = conflictingText.get();
|
||||
if (newText.text().length() > existingText.text().length()) {
|
||||
placedTexts.remove(existingText);
|
||||
placedTexts.add(newText);
|
||||
}
|
||||
} else {
|
||||
placedTexts.add(newText);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void handleSinglePage(SemanticNode semanticNode, Page page, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {
|
||||
|
||||
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, page, rectangle2D, treeIdString, maxChildDepth, ownDepth);
|
||||
// add string to top line
|
||||
var firstLine = result.pageLines().remove(0);
|
||||
result.coloredLines().add(new ColoredLine(firstLine, SECTION_COLOR, LINE_WIDTH));
|
||||
for (Line2D line : result.pageLines()) {
|
||||
result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void handleFirstPageOfSection(SemanticNode semanticNode, Page firstPage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {
|
||||
|
||||
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, firstPage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
|
||||
// remove bottom line
|
||||
result.pageLines().remove(2);
|
||||
// add string to top line
|
||||
var firstLine = result.pageLines().remove(0);
|
||||
result.coloredLines().add(new ColoredLine(firstLine, SECTION_COLOR, LINE_WIDTH));
|
||||
for (Line2D line : result.pageLines()) {
|
||||
result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void handleForMiddlePageOfSection(SemanticNode semanticNode, Page middlePage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {
|
||||
|
||||
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, middlePage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
|
||||
// remove top line
|
||||
result.pageLines().remove(0);
|
||||
// remove bottom line
|
||||
result.pageLines().remove(1);
|
||||
// add string to left line
|
||||
var leftLine = result.pageLines().remove(1);
|
||||
result.coloredLines().add(new ColoredLine(leftLine, SECTION_COLOR, LINE_WIDTH));
|
||||
for (Line2D line : result.pageLines()) {
|
||||
result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void handleLastPageOfSection(SemanticNode semanticNode, Page lastPage, Rectangle2D rectangle2D, String treeIdString, Integer maxChildDepth, Integer ownDepth) {
|
||||
|
||||
RectangleAndLinesResult result = createLinesAndPlaceText(semanticNode, lastPage, rectangle2D, treeIdString, maxChildDepth, ownDepth);
|
||||
// remove top line
|
||||
result.pageLines().remove(0);
|
||||
// add string to left line
|
||||
var leftLine = result.pageLines().remove(2);
|
||||
result.coloredLines().add(new ColoredLine(leftLine, SECTION_COLOR, LINE_WIDTH));
|
||||
for (Line2D line : result.pageLines()) {
|
||||
result.coloredLines().add(new ColoredLine(line, SECTION_COLOR, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private RectangleAndLinesResult createLinesAndPlaceText(SemanticNode semanticNode,
|
||||
Page page,
|
||||
Rectangle2D rectangle2D,
|
||||
String treeIdString,
|
||||
Integer maxChildDepth,
|
||||
Integer ownDepth) {
|
||||
|
||||
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), sections).getColoredLines();
|
||||
int lineWidthModifier = maxChildDepth - ownDepth;
|
||||
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox().get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier));
|
||||
|
||||
SemanticNode highestParent = semanticNode.getHighestParent();
|
||||
Rectangle2D highestParentRect = rectangleMap.get(new RectangleIdentifier(highestParent.getTreeId(), page.getNumber()));
|
||||
addPlacedText(page, rectangle2D, highestParentRect, treeIdString, maxChildDepth, sections, SECTION_COLOR);
|
||||
var lastPageLines = createLinesFromRectangle(r, page.getRotation());
|
||||
|
||||
if (semanticNode instanceof SuperSection) {
|
||||
rectangleMap.put(new RectangleIdentifier(semanticNode.getTreeId(), page.getNumber()), r);
|
||||
}
|
||||
|
||||
return new RectangleAndLinesResult(coloredLines, r, lastPageLines);
|
||||
}
|
||||
|
||||
|
||||
private void addInnerTableLines(Table table) {
|
||||
|
||||
if (table.getNumberOfCols() < 1 || table.getNumberOfRows() < 1) {
|
||||
return;
|
||||
}
|
||||
for (Page page : table.getPages()) {
|
||||
|
||||
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0)
|
||||
.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
|
||||
.map(TableCell::getRow)
|
||||
.findFirst();
|
||||
if (optionalFirstRowOnPage.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
int firstRowOnPage = optionalFirstRowOnPage.get();
|
||||
|
||||
Stream<Double> xStream = switch (page.getRotation()) {
|
||||
case 90 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinX);
|
||||
case 180 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMaxX);
|
||||
case 270 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMaxX);
|
||||
default -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMinX);
|
||||
};
|
||||
List<Double> xs = xStream.collect(Collectors.toList());
|
||||
xs.remove(0);
|
||||
|
||||
Stream<Double> yStream = switch (page.getRotation()) {
|
||||
case 90 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMinY);
|
||||
case 180 -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMinY);
|
||||
case 270 -> streamBBoxOfCellsOnPage(table.streamRow(firstRowOnPage), page).map(RectangularShape::getMaxY);
|
||||
default -> streamBBoxOfCellsOnPage(table.streamCol(0), page).map(RectangularShape::getMaxY);
|
||||
};
|
||||
List<Double> ys = yStream.collect(Collectors.toList());
|
||||
ys.remove(0);
|
||||
|
||||
Rectangle2D tableBBox = table.getBBox().get(page);
|
||||
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), tables).getColoredLines();
|
||||
|
||||
xs.forEach(x -> {
|
||||
Line2D line = new Line2D.Double(new Point2D.Double(x, tableBBox.getMaxY()), new Point2D.Double(x, tableBBox.getMinY()));
|
||||
coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH));
|
||||
});
|
||||
ys.forEach(y -> {
|
||||
Line2D line = new Line2D.Double(new Point2D.Double(tableBBox.getMinX(), y), new Point2D.Double(tableBBox.getMaxX(), y));
|
||||
coloredLines.add(new ColoredLine(line, INNER_LINES_COLOR, LINE_WIDTH));
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static Stream<Rectangle2D> streamBBoxOfCellsOnPage(Stream<TableCell> table, Page page) {
|
||||
|
||||
return table.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
|
||||
.map(TableCell::getBBox)
|
||||
.map(bBoxMap -> bBoxMap.get(page));
|
||||
}
|
||||
|
||||
|
||||
private void addAsRectangle(SemanticNode semanticNode, Visualizations visualizations, Color color) {
|
||||
|
||||
addAsRectangle(semanticNode.getBBox(), visualizations, color);
|
||||
}
|
||||
|
||||
|
||||
private void addAsRectangle(Map<Page, Rectangle2D> bbox, Visualizations visualizations, Color color) {
|
||||
|
||||
bbox.forEach((page, textBBox) -> getOrCreateVisualizationsOnPage(page.getNumber(), visualizations).getColoredRectangles()
|
||||
.add(new ColoredRectangle(textBBox, color, LINE_WIDTH)));
|
||||
}
|
||||
|
||||
|
||||
private record RectangleAndLinesResult(List<ColoredLine> coloredLines, Rectangle2D rectangle, List<Line2D> pageLines) {
|
||||
|
||||
}
|
||||
|
||||
private record RectangleIdentifier(List<Integer> treeId, Integer pageNumber) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
A __________________ B
|
||||
| |
|
||||
| |
|
||||
| |
|
||||
| |
|
||||
D|__________________| C
|
||||
The returned List are the lines [AB, BC, DC, AD]
|
||||
The List is reordered, such that the order of the returned lines are always as viewed on the page.
|
||||
*/
|
||||
private List<Line2D> createLinesFromRectangle(Rectangle2D r, int pageRotation) {
|
||||
// +0.5 to join the lines
|
||||
List<Line2D> lines = new ArrayList<>(4);
|
||||
float lineWidthCorrection = LINE_WIDTH * 0.5f;
|
||||
Point2D.Float a = new Point2D.Float((float) r.getMinX(), (float) r.getMaxY());
|
||||
Point2D.Float a1 = new Point2D.Float((float) r.getMinX() - lineWidthCorrection, (float) r.getMaxY());
|
||||
Point2D.Float b = new Point2D.Float((float) r.getMaxX(), (float) r.getMaxY());
|
||||
Point2D.Float b1 = new Point2D.Float((float) r.getMaxX() + lineWidthCorrection, (float) r.getMaxY());
|
||||
Point2D.Float c = new Point2D.Float((float) r.getMaxX(), (float) r.getMinY());
|
||||
Point2D.Float c1 = new Point2D.Float((float) r.getMaxX() + lineWidthCorrection, (float) r.getMinY());
|
||||
Point2D.Float d = new Point2D.Float((float) r.getMinX(), (float) r.getMinY());
|
||||
Point2D.Float d1 = new Point2D.Float((float) r.getMinX() - lineWidthCorrection, (float) r.getMinY());
|
||||
lines.add(new Line2D.Float(a1, b1));
|
||||
lines.add(new Line2D.Float(b, c));
|
||||
lines.add(new Line2D.Float(d1, c1));
|
||||
lines.add(new Line2D.Float(a, d));
|
||||
|
||||
return switch (pageRotation) {
|
||||
case 90 -> {
|
||||
Collections.rotate(lines, 1);
|
||||
yield lines;
|
||||
}
|
||||
case 180 -> {
|
||||
Collections.rotate(lines, 2);
|
||||
yield lines;
|
||||
}
|
||||
case 270 -> {
|
||||
Collections.rotate(lines, 3);
|
||||
yield lines;
|
||||
}
|
||||
|
||||
default -> lines;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
private Point2D add(Point2D a, Point2D b) {
|
||||
|
||||
return new Point2D.Double(a.getX() + b.getX(), a.getY() + b.getY());
|
||||
}
|
||||
|
||||
}
|
||||
@ -38,7 +38,7 @@ dependencies {
|
||||
implementation("com.amazonaws:aws-java-sdk-s3:1.12.536")
|
||||
implementation("org.springframework.cloud:spring-cloud-starter-openfeign:4.0.4")
|
||||
implementation("net.logstash.logback:logstash-logback-encoder:7.4")
|
||||
implementation("com.pdftron:PDFNet:10.5.0")
|
||||
implementation("com.pdftron:PDFNet:10.7.0")
|
||||
|
||||
// for integration testing only
|
||||
testImplementation(project(":viewer-doc-processor"))
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server;
|
||||
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
@ -17,7 +18,8 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class PDFNetInitializer {
|
||||
|
||||
private final LayoutparserSettings settings;
|
||||
@Value("${pdftron.license:}")
|
||||
private String pdftronLicense;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@ -25,13 +27,13 @@ public class PDFNetInitializer {
|
||||
// Do not change back to application runner, if it is application runner it takes messages from the queue before PDFNet is initialized, that leads to UnsatisfiedLinkError.
|
||||
public void init() {
|
||||
|
||||
if (Strings.isNullOrEmpty(settings.getPdftronLicense())) {
|
||||
if (Strings.isNullOrEmpty(pdftronLicense)) {
|
||||
return;
|
||||
}
|
||||
log.info("Initializing Native Libraries");
|
||||
log.info("Setting pdftron license: {}", settings.getPdftronLicense());
|
||||
log.info("Setting pdftron license: {}", pdftronLicense);
|
||||
PDFNet.setTempPath("/tmp/pdftron");
|
||||
PDFNet.initialize(settings.getPdftronLicense());
|
||||
PDFNet.initialize(pdftronLicense);
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@ public class BdrJsonBuildTest extends AbstractTest {
|
||||
protected Document buildGraph(File file) {
|
||||
|
||||
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.CLARIFYND,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.CLARIFYND,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.CLARIFYND,
|
||||
file,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
|
||||
@ -97,7 +97,7 @@ public class HeadlinesGoldStandardIntegrationTest {
|
||||
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
|
||||
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||
pdfFileResource.getFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
|
||||
@ -12,6 +12,7 @@ import java.util.Map;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
@ -32,18 +33,29 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||
|
||||
import jakarta.annotation.PostConstruct;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class OutlineDetectionTest extends AbstractTest {
|
||||
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
|
||||
@Autowired
|
||||
protected LayoutParsingPipeline layoutParsingPipeline;
|
||||
|
||||
@Autowired
|
||||
PDFNetInitializer pdfNetInitializer;
|
||||
|
||||
|
||||
@BeforeEach
|
||||
public void init() {
|
||||
|
||||
pdfNetInitializer.init();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
@ -60,28 +72,17 @@ public class OutlineDetectionTest extends AbstractTest {
|
||||
|
||||
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
|
||||
assertEquals(outlineObjectTree.getRootNodes().size(), 8);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(1).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(3).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(4).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(5).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(6).size(), 2);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(7).size(), 3);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(8).size(), 2);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(10).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(11).size(), 4);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(12).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage()
|
||||
.get(13).size(), 2);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(1).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(3).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(4).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(5).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(6).size(), 2);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(7).size(), 3);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(8).size(), 2);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(10).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(11).size(), 4);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(12).size(), 1);
|
||||
assertEquals(outlineObjectTree.getOutlineObjectsPerPage().get(13).size(), 2);
|
||||
assertTrue(outlineObjectTree.getOutlineObjectsPerPage().values()
|
||||
.stream()
|
||||
.flatMap(Collection::stream)
|
||||
@ -98,29 +99,15 @@ public class OutlineDetectionTest extends AbstractTest {
|
||||
.stream()
|
||||
.map(outlineObjectTreeNode -> sanitizeString(outlineObjectTreeNode.getOutlineObject().getTitle()))
|
||||
.toList());
|
||||
assertEquals(tableOfContents.getMainSections()
|
||||
.get(5).getChildren().size(), 6);
|
||||
assertEquals(tableOfContents.getMainSections()
|
||||
.get(7).getChildren().size(), 3);
|
||||
assertEquals(tableOfContents.getMainSections()
|
||||
.get(8).getChildren().size(), 3);
|
||||
assertEquals(tableOfContents.getMainSections()
|
||||
.get(8).getChildren()
|
||||
.get(2).getChildren().size(), 1);
|
||||
assertEquals(tableOfContents.getMainSections()
|
||||
.get(8).getChildren()
|
||||
.get(2).getChildren()
|
||||
.get(0).getChildren().size(), 3);
|
||||
assertEquals(tableOfContents.getMainSections().get(5).getChildren().size(), 6);
|
||||
assertEquals(tableOfContents.getMainSections().get(7).getChildren().size(), 3);
|
||||
assertEquals(tableOfContents.getMainSections().get(8).getChildren().size(), 3);
|
||||
assertEquals(tableOfContents.getMainSections().get(8).getChildren().get(2).getChildren().size(), 1);
|
||||
assertEquals(tableOfContents.getMainSections().get(8).getChildren().get(2).getChildren().get(0).getChildren().size(), 3);
|
||||
|
||||
assertEquals(tableOfContents.getMainSections()
|
||||
.get(0).getImages().size(), 1);
|
||||
assertEquals(tableOfContents.getMainSections()
|
||||
.get(6).getImages().size(), 1);
|
||||
assertEquals(tableOfContents.getMainSections()
|
||||
.get(8).getChildren()
|
||||
.get(2).getChildren()
|
||||
.get(0).getChildren()
|
||||
.get(2).getImages().size(), 1);
|
||||
assertEquals(tableOfContents.getMainSections().get(0).getImages().size(), 1);
|
||||
assertEquals(tableOfContents.getMainSections().get(6).getImages().size(), 1);
|
||||
assertEquals(tableOfContents.getMainSections().get(8).getChildren().get(2).getChildren().get(0).getChildren().get(2).getImages().size(), 1);
|
||||
|
||||
Document document = buildGraph(fileName, classificationDocument);
|
||||
|
||||
@ -159,17 +146,14 @@ public class OutlineDetectionTest extends AbstractTest {
|
||||
.count(), 3 + 1);
|
||||
assertEquals(childrenOfTypeSectionOrSuperSection.get(8).streamChildren()
|
||||
.filter(isSectionOrSuperSection)
|
||||
.toList()
|
||||
.get(3).streamChildren()
|
||||
.toList().get(3).streamChildren()
|
||||
.filter(isSectionOrSuperSection)
|
||||
.count(), 1 + 1);
|
||||
assertEquals(childrenOfTypeSectionOrSuperSection.get(8).streamChildren()
|
||||
.filter(isSectionOrSuperSection)
|
||||
.toList()
|
||||
.get(3).streamChildren()
|
||||
.toList().get(3).streamChildren()
|
||||
.filter(isSectionOrSuperSection)
|
||||
.toList()
|
||||
.get(1).streamChildren()
|
||||
.toList().get(1).streamChildren()
|
||||
.filter(isSectionOrSuperSection)
|
||||
.count(), 3 + 1);
|
||||
|
||||
|
||||
@ -3,11 +3,8 @@ package com.knecon.fforesight.service.layoutparser.server;
|
||||
import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
@ -16,12 +13,8 @@ import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingStorageService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
@ -30,7 +23,6 @@ import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedS
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||
|
||||
import io.micrometer.observation.Observation;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
|
||||
@ -1,71 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||
|
||||
class BoundaryTest {
|
||||
|
||||
Boundary startBoundary;
|
||||
|
||||
|
||||
@BeforeEach
|
||||
void setUp() {
|
||||
|
||||
startBoundary = new Boundary(10, 100);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void testContains() {
|
||||
|
||||
assertTrue(startBoundary.contains(11));
|
||||
assertTrue(startBoundary.contains(50));
|
||||
assertFalse(startBoundary.contains(9));
|
||||
assertFalse(startBoundary.contains(100));
|
||||
assertFalse(startBoundary.contains(150));
|
||||
assertFalse(startBoundary.contains(-123));
|
||||
assertTrue(startBoundary.contains(new Boundary(11, 99)));
|
||||
assertTrue(startBoundary.contains(new Boundary(10, 100)));
|
||||
assertTrue(startBoundary.contains(new Boundary(11, 11)));
|
||||
assertFalse(startBoundary.contains(9, 100));
|
||||
assertTrue(startBoundary.contains(100, 100));
|
||||
assertFalse(startBoundary.contains(100, 101));
|
||||
assertFalse(startBoundary.contains(150, 151));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void testIntersects() {
|
||||
|
||||
assertTrue(startBoundary.intersects(new Boundary(1, 11)));
|
||||
assertTrue(startBoundary.intersects(new Boundary(11, 12)));
|
||||
assertTrue(startBoundary.intersects(new Boundary(11, 100)));
|
||||
assertFalse(startBoundary.intersects(new Boundary(100, 101)));
|
||||
assertTrue(startBoundary.intersects(new Boundary(99, 101)));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void testSplit() {
|
||||
|
||||
assertEquals(4, startBoundary.split(List.of(12, 40, 90)).size());
|
||||
assertEquals(List.of(new Boundary(10, 12), new Boundary(12, 40), new Boundary(40, 90), new Boundary(90, 100)), startBoundary.split(List.of(12, 40, 90)));
|
||||
assertEquals(List.of(new Boundary(10, 40), new Boundary(40, 100)), startBoundary.split(List.of(40)));
|
||||
assertEquals(1, startBoundary.split(Collections.emptyList()).size());
|
||||
assertEquals(1, startBoundary.split(List.of(startBoundary.start())).size());
|
||||
assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(Collections.singletonList(0)));
|
||||
assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(Collections.singletonList(100)));
|
||||
assertThrows(IndexOutOfBoundsException.class, () -> startBoundary.split(List.of(12, 40, 100)));
|
||||
}
|
||||
|
||||
}
|
||||
@ -57,7 +57,7 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
|
||||
private void writeJsons(Path filename) {
|
||||
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
|
||||
@ -0,0 +1,71 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.TextRange;
|
||||
|
||||
class TextRangeTest {
|
||||
|
||||
TextRange startTextRange;
|
||||
|
||||
|
||||
@BeforeEach
|
||||
void setUp() {
|
||||
|
||||
startTextRange = new TextRange(10, 100);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void testContains() {
|
||||
|
||||
assertTrue(startTextRange.contains(11));
|
||||
assertTrue(startTextRange.contains(50));
|
||||
assertFalse(startTextRange.contains(9));
|
||||
assertFalse(startTextRange.contains(100));
|
||||
assertFalse(startTextRange.contains(150));
|
||||
assertFalse(startTextRange.contains(-123));
|
||||
assertTrue(startTextRange.contains(new TextRange(11, 99)));
|
||||
assertTrue(startTextRange.contains(new TextRange(10, 100)));
|
||||
assertTrue(startTextRange.contains(new TextRange(11, 11)));
|
||||
assertFalse(startTextRange.contains(9, 100));
|
||||
assertTrue(startTextRange.contains(100, 100));
|
||||
assertFalse(startTextRange.contains(100, 101));
|
||||
assertFalse(startTextRange.contains(150, 151));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void testIntersects() {
|
||||
|
||||
assertTrue(startTextRange.intersects(new TextRange(1, 11)));
|
||||
assertTrue(startTextRange.intersects(new TextRange(11, 12)));
|
||||
assertTrue(startTextRange.intersects(new TextRange(11, 100)));
|
||||
assertFalse(startTextRange.intersects(new TextRange(100, 101)));
|
||||
assertTrue(startTextRange.intersects(new TextRange(99, 101)));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void testSplit() {
|
||||
|
||||
assertEquals(4, startTextRange.split(List.of(12, 40, 90)).size());
|
||||
assertEquals(List.of(new TextRange(10, 12), new TextRange(12, 40), new TextRange(40, 90), new TextRange(90, 100)), startTextRange.split(List.of(12, 40, 90)));
|
||||
assertEquals(List.of(new TextRange(10, 40), new TextRange(40, 100)), startTextRange.split(List.of(40)));
|
||||
assertEquals(1, startTextRange.split(Collections.emptyList()).size());
|
||||
assertEquals(1, startTextRange.split(List.of(startTextRange.start())).size());
|
||||
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(0)));
|
||||
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(Collections.singletonList(100)));
|
||||
assertThrows(IndexOutOfBoundsException.class, () -> startTextRange.split(List.of(12, 40, 100)));
|
||||
}
|
||||
|
||||
}
|
||||
@ -4,11 +4,18 @@ import java.io.File;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Map;
|
||||
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.mockito.MockitoAnnotations;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.boot.test.mock.mockito.MockBean;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
@ -16,17 +23,30 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocumentService;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
|
||||
import jakarta.annotation.PostConstruct;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
@Autowired
|
||||
PDFNetInitializer pdfNetInitializer;
|
||||
PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
|
||||
|
||||
@BeforeEach
|
||||
public void init() {
|
||||
|
||||
pdfNetInitializer.init();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
@ -63,7 +83,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
tableResponse,
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||
PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null);
|
||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE_OLD, classificationDocument);
|
||||
|
||||
|
||||
@ -9,3 +9,5 @@ storage:
|
||||
key: minioadmin
|
||||
secret: minioadmin
|
||||
|
||||
|
||||
|
||||
@ -28,6 +28,11 @@ spring:
|
||||
max-interval: 15000
|
||||
prefetch: 1
|
||||
|
||||
layoutparser:
|
||||
debug: true
|
||||
|
||||
pdftron.license: demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a
|
||||
|
||||
management:
|
||||
endpoint:
|
||||
metrics.enabled: ${monitoring.enabled:false}
|
||||
|
||||
@ -12,7 +12,7 @@ dependencies {
|
||||
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
|
||||
implementation("org.slf4j:slf4j-api:1.7.25")
|
||||
implementation("com.knecon.fforesight:tracing-commons:0.5.0")
|
||||
implementation("com.pdftron:PDFNet:10.5.0")
|
||||
implementation("com.pdftron:PDFNet:10.7.0")
|
||||
|
||||
testImplementation("org.apache.logging.log4j:log4j-slf4j-impl:2.22.1")
|
||||
testImplementation("org.junit.jupiter:junit-jupiter")
|
||||
|
||||
@ -1,72 +0,0 @@
|
||||
package com.knecon.fforesight.service.viewerdoc;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PUBLIC)
|
||||
public class ContentStreams {
|
||||
|
||||
public static Identifier KNECON_LAYOUT = new Identifier("Layout grid", COSName.getPDFName("KNECON_LAYOUT"), true);
|
||||
|
||||
public static Identifier KNECON_VISUAL_PARSING = new Identifier("Layout grid - visual", COSName.getPDFName("KNECON_VISUAL_PARSING"), true);
|
||||
|
||||
public static Identifier KNECON_OCR = new Identifier("OCR", COSName.getPDFName("KNECON_OCR"), false);
|
||||
|
||||
public static Identifier KNECON_OCR_TEXT_DEBUG = new Identifier("OCR Text", COSName.getPDFName("KNECON_OCR_TEXT_DEBUG"), true);
|
||||
|
||||
public static Identifier KNECON_OCR_BBOX_DEBUG = new Identifier("OCR Boxes", COSName.getPDFName("KNECON_OCR_BBOX_DEBUG"), true);
|
||||
|
||||
public static Identifier OTHER = new Identifier("other", COSName.getPDFName("OTHER"), false);
|
||||
|
||||
public static Identifier ESCAPE_START = new Identifier("escape start", COSName.getPDFName("ESCAPE_START"), false);
|
||||
|
||||
public static Identifier ESCAPE_END = new Identifier("escape start", COSName.getPDFName("ESCAPE_END"), false);
|
||||
|
||||
public static Identifier CLEAN_RULINGS = new Identifier("Cleaned Rulings", COSName.getPDFName("KNECON_CLEAN_RULINGS"), true);
|
||||
|
||||
public static Identifier RULINGS = new Identifier("Rulings", COSName.getPDFName("KNECON_RULINGS"), true);
|
||||
|
||||
public static Identifier WORDS = new Identifier("Words", COSName.getPDFName("KNECON_WORDS"), true);
|
||||
|
||||
public static Identifier ZONES = new Identifier("Text Zones", COSName.getPDFName("KNECON_ZONES"), true);
|
||||
|
||||
public static Identifier LINES = new Identifier("Text Lines", COSName.getPDFName("KNECON_LINES"), true);
|
||||
|
||||
public static Identifier CELLS = new Identifier("Cells", COSName.getPDFName("KNECON_CELLS"), true);
|
||||
|
||||
public static Identifier MAIN_BODY = new Identifier("Main Text Body", COSName.getPDFName("KNECON_MAIN_BODY"), true);
|
||||
|
||||
public static Identifier MARKED_CONTENT = new Identifier("Marked content", COSName.getPDFName("KNECON_MARKED_CONTENT"), true);
|
||||
|
||||
public static Identifier NEIGHBOURS = new Identifier("Neighbours", COSName.getPDFName("KNECON_NEIGHBOURS"), true);
|
||||
|
||||
public static Identifier CHARACTERS = new Identifier("Characters", COSName.getPDFName("KNECON_CHARACTERS"), true);
|
||||
|
||||
public static List<Identifier> allContentStreams = List.of(KNECON_LAYOUT,
|
||||
KNECON_VISUAL_PARSING,
|
||||
KNECON_OCR,
|
||||
KNECON_OCR_BBOX_DEBUG,
|
||||
KNECON_OCR_TEXT_DEBUG,
|
||||
OTHER,
|
||||
ESCAPE_START,
|
||||
ESCAPE_END,
|
||||
RULINGS,
|
||||
CLEAN_RULINGS,
|
||||
WORDS,
|
||||
ZONES,
|
||||
LINES,
|
||||
MAIN_BODY,
|
||||
MARKED_CONTENT,
|
||||
NEIGHBOURS,
|
||||
CHARACTERS,
|
||||
CELLS);
|
||||
|
||||
public record Identifier(String name, COSName cosName, boolean optionalContent) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,77 @@
|
||||
package com.knecon.fforesight.service.viewerdoc;
|
||||
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
|
||||
/*
|
||||
These identifiers are used to mark content in the pdf, such that it may be found later. The markedContentName must therefore be unique.
|
||||
The String "name" is only used to display optional content in the optional content view in the pdf.
|
||||
Therefore, it may be null, if optionalContent is false.
|
||||
If optionalContent is false, the layer will not be created as a OCG, and will not be listed in the OCG view.
|
||||
*/
|
||||
public record LayerIdentifier(String name, String markedContentName) {
|
||||
|
||||
public String markedContentName() {
|
||||
// The prefix KNECON_ is used to identify marked contents as knecon contents later on
|
||||
return KNECON_IDENTIFIER_PREFIX + markedContentName;
|
||||
}
|
||||
|
||||
|
||||
public COSName cosName() {
|
||||
|
||||
return COSName.getPDFName(markedContentName);
|
||||
}
|
||||
|
||||
|
||||
public static final String KNECON_IDENTIFIER_PREFIX = "KNECON_";
|
||||
|
||||
public static final LayerIdentifier KNECON_OCR = new LayerIdentifier(null, "OCR");
|
||||
public static final LayerIdentifier KNECON_OCR_TEXT = new LayerIdentifier(null, "OCR_TEXT");
|
||||
public static final LayerIdentifier KNECON_OCR_LINES = new LayerIdentifier(null, "OCR_LINES");
|
||||
|
||||
// layers
|
||||
// layout grid
|
||||
public static final LayerIdentifier KNECON_LAYOUT = new LayerIdentifier("Layout grid", "LAYOUT");
|
||||
public static final LayerIdentifier KNECON_LAYOUT_SECTION = new LayerIdentifier("Section", "LAYOUT_SECTION");
|
||||
public static final LayerIdentifier KNECON_LAYOUT_PARAGRAPH = new LayerIdentifier("Paragraph ", "LAYOUT_PARAGRAPH");
|
||||
public static final LayerIdentifier KNECON_LAYOUT_KEY_VALUE = new LayerIdentifier("Key-Value Pairs ", "LAYOUT_KEY_VALUE");
|
||||
public static final LayerIdentifier KNECON_LAYOUT_HEADLINE = new LayerIdentifier("Headline", "LAYOUT_HEADLINE");
|
||||
public static final LayerIdentifier KNECON_LAYOUT_HEADER_FOOTER = new LayerIdentifier("Header/Footer", "LAYOUT_HEADER_FOOTER");
|
||||
public static final LayerIdentifier KNECON_LAYOUT_TABLE = new LayerIdentifier("Tables", "LAYOUT_TABLE");
|
||||
public static final LayerIdentifier KNECON_LAYOUT_FIGURES = new LayerIdentifier("Figures", "LAYOUT_FIGURES");
|
||||
public static final LayerIdentifier KNECON_LAYOUT_IMAGES = new LayerIdentifier("Images", "LAYOUT_IMAGES");
|
||||
public static final LayerIdentifier KNECON_LAYOUT_TREE_IDs = new LayerIdentifier("Tree IDs", "LAYOUT_TREE_IDs");
|
||||
|
||||
//layout grid debug
|
||||
public static final LayerIdentifier KNECON_LAYOUT_DEBUG = new LayerIdentifier("Layout elements", "DEBUG_LAYOUT");
|
||||
public static final LayerIdentifier CLEAN_RULINGS = new LayerIdentifier("Classified Rulings", "CLEAN_RULINGS");
|
||||
public static final LayerIdentifier RULINGS = new LayerIdentifier("Rulings", "RULINGS");
|
||||
public static final LayerIdentifier WORDS = new LayerIdentifier("Words", "WORDS");
|
||||
public static final LayerIdentifier ZONES = new LayerIdentifier("Text Zones", "ZONES");
|
||||
public static final LayerIdentifier LINES = new LayerIdentifier("Text Lines", "LINES");
|
||||
public static final LayerIdentifier CELLS = new LayerIdentifier("Cells", "CELLS");
|
||||
public static final LayerIdentifier MAIN_BODY = new LayerIdentifier("Main Text Body", "MAIN_BODY");
|
||||
public static final LayerIdentifier MARKED_CONTENT = new LayerIdentifier("Marked content", "MARKED_CONTENT");
|
||||
public static final LayerIdentifier NEIGHBOURS = new LayerIdentifier("Neighbours", "NEIGHBOURS");
|
||||
public static final LayerIdentifier CHARACTERS = new LayerIdentifier("Characters", "CHARACTERS");
|
||||
|
||||
public static final LayerIdentifier KNECON_VISUAL_PARSING = new LayerIdentifier("Visual Layout Parser", "VISUAL_PARSING");
|
||||
|
||||
//ocr
|
||||
public static final LayerIdentifier KNECON_OCR_DEBUG = new LayerIdentifier("OCR", "OCR_DEBUG");
|
||||
public static final LayerIdentifier KNECON_OCR_TEXT_DEBUG = new LayerIdentifier("OCR Text", "OCR_TEXT_DEBUG");
|
||||
public static final LayerIdentifier KNECON_OCR_BBOX_DEBUG = new LayerIdentifier("OCR Words", "OCR_BBOX_DEBUG");
|
||||
public static final LayerIdentifier KNECON_OCR_LINE_DEBUG = new LayerIdentifier("OCR Lines", "OCR_LINE_DEBUG");
|
||||
public static final LayerIdentifier KNECON_OCR_OVERLAPPED_TEXT = new LayerIdentifier("OCR overlapped Text", "OCR_OVERLAPPED_TEXT_DEBUG");
|
||||
|
||||
//azure idp
|
||||
public static final LayerIdentifier KNECON_AZURE_IDP = new LayerIdentifier("IDP", "IDP");
|
||||
public static final LayerIdentifier IDP_FIGURES = new LayerIdentifier("IDP Figures", "IDP_FIGURES");
|
||||
public static final LayerIdentifier IDP_TABLES = new LayerIdentifier("IDP Tables", "IDP_TABLES");
|
||||
public static final LayerIdentifier IDP_KV_PAIRS = new LayerIdentifier("IDP Key Value Pair", "IDP_KV_PAIRS");
|
||||
public static final LayerIdentifier IDP_SECTIONS = new LayerIdentifier("IDP Sections", "IDP_SECTIONS");
|
||||
public static final LayerIdentifier IDP_LINES = new LayerIdentifier("IDP Lines", "IDP_LINES");
|
||||
public static final LayerIdentifier IDP_PARAGRAPHS = new LayerIdentifier("IDP Paragraphs", "IDP_PARAGRAPHS");
|
||||
public static final LayerIdentifier IDP_LIST = new LayerIdentifier("IDP Lists", "IDP_LISTS");
|
||||
public static final LayerIdentifier IDP_BARCODES = new LayerIdentifier("IDP Barcodes", "IDP_BARCODES");
|
||||
|
||||
}
|
||||
@ -0,0 +1,19 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.layers;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
|
||||
|
||||
public abstract class AbstractLayerGroup implements LayerGroup {
|
||||
|
||||
protected VisualizationsOnPage getOrCreateVisualizationsOnPage(int page, Visualizations visualizations) {
|
||||
|
||||
if (visualizations.getVisualizationsOnPages().containsKey(page)) {
|
||||
return visualizations.getVisualizationsOnPages()
|
||||
.get(page);
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = VisualizationsOnPage.builder().build();
|
||||
visualizations.getVisualizationsOnPages().put(page, visualizationsOnPage);
|
||||
return visualizationsOnPage;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,49 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.layers;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
public class IdpLayerConfig extends AbstractLayerGroup {
|
||||
|
||||
@Getter
|
||||
public final LayerIdentifier groupIdentifier = LayerIdentifier.KNECON_AZURE_IDP;
|
||||
|
||||
public static final LayerGroup CONFIG_INSTANCE = new IdpLayerConfig();
|
||||
|
||||
protected final Visualizations figures = Visualizations.builder().layer(LayerIdentifier.IDP_FIGURES).visibleByDefault(true).build();
|
||||
protected final Visualizations tables = Visualizations.builder().layer(LayerIdentifier.IDP_TABLES).visibleByDefault(true).build();
|
||||
protected final Visualizations keyValuePairs = Visualizations.builder().layer(LayerIdentifier.IDP_KV_PAIRS).visibleByDefault(true).build();
|
||||
protected final Visualizations paragraphs = Visualizations.builder().layer(LayerIdentifier.IDP_PARAGRAPHS).build();
|
||||
protected final Visualizations sections = Visualizations.builder().layer(LayerIdentifier.IDP_SECTIONS).build();
|
||||
protected final Visualizations lines = Visualizations.builder().layer(LayerIdentifier.IDP_LINES).build();
|
||||
protected final Visualizations lists = Visualizations.builder().layer(LayerIdentifier.IDP_LIST).visibleByDefault(true).build();
|
||||
protected final Visualizations barcodes = Visualizations.builder().layer(LayerIdentifier.IDP_BARCODES).visibleByDefault(true).build();
|
||||
|
||||
protected static final Color TABLE_COLOR = new Color(102, 205, 170);
|
||||
protected static final Color INNER_LINES_COLOR = new Color(255, 175, 175);
|
||||
protected static final Color SECTION_COLOR = new Color(50, 50, 50);
|
||||
protected static final Color SECTION_HEADING_COLOR = new Color(162, 56, 56);
|
||||
protected static final Color TITLE_COLOR = new Color(221, 25, 25);
|
||||
protected static final Color HEADER_FOOTER_COLOR = new Color(171, 131, 6);
|
||||
protected static final Color FOOTNOTE_COLOR = new Color(6, 64, 171);
|
||||
protected static final Color FORMULA_COLOR = new Color(80, 171, 6);
|
||||
protected static final Color PARAGRAPH_COLOR = new Color(70, 130, 180);
|
||||
protected static final Color IMAGE_COLOR = new Color(253, 63, 146);
|
||||
protected static final Color KEY_VALUE_BBOX_COLOR = new Color(0, 39, 85);
|
||||
protected static final Color KEY_COLOR = new Color(30, 92, 172);
|
||||
protected static final Color VALUE_COLOR = new Color(30, 172, 146);
|
||||
protected static final Color LINES_COLOR = new Color(152, 45, 179);
|
||||
|
||||
|
||||
@Override
|
||||
public List<Visualizations> getVisualizations() {
|
||||
|
||||
return List.of(paragraphs, sections, figures, tables, keyValuePairs, lines, lists, barcodes);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,62 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.layers;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
|
||||
public interface LayerGroup {
|
||||
|
||||
LayerIdentifier getGroupIdentifier();
|
||||
|
||||
|
||||
List<Visualizations> getVisualizations();
|
||||
|
||||
|
||||
default List<LayerIdentifier> getSubLayers() {
|
||||
|
||||
return getVisualizations().stream()
|
||||
.map(Visualizations::getLayer)
|
||||
.toList();
|
||||
|
||||
}
|
||||
|
||||
|
||||
default boolean isVisibleByDefault() {
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
indicates the sub layers are all optional content
|
||||
*/
|
||||
default boolean subLayersAreOptionalContent() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
indicates the LayerGroup is also a optional content group, and should be displayed as such:
|
||||
layer
|
||||
- sublayer0
|
||||
- sublayer1
|
||||
|
||||
see note in specification 8.11.4.3
|
||||
*/
|
||||
default boolean isOptionalContent() {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
default boolean isEmpty() {
|
||||
|
||||
return getVisualizations().isEmpty();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
@ -0,0 +1,73 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.layers;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
public class LayoutDebugLayerConfig extends AbstractLayerGroup {
|
||||
|
||||
@Getter
|
||||
public final LayerIdentifier groupIdentifier = LayerIdentifier.KNECON_LAYOUT_DEBUG;
|
||||
|
||||
public static final LayerGroup CONFIG_INSTANCE = new LayoutDebugLayerConfig();
|
||||
|
||||
protected static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
|
||||
|
||||
protected static final Color WORDS_COLOR = new Color(68, 84, 147);
|
||||
protected static final Color LINES_COLOR = new Color(152, 45, 179);
|
||||
protected static final Color ZONES_COLOR = new Color(131, 38, 38);
|
||||
|
||||
protected static final Color RULINGS_COLOR = new Color(21, 221, 174);
|
||||
protected static final Color TABLE_RULINGS_COLOR = new Color(255, 175, 175);
|
||||
protected static final Color HEADER_RULING_COLOR = new Color(171, 131, 6);
|
||||
protected static final Color FOOTER_RULING_COLOR = new Color(106, 82, 2);
|
||||
protected static final Color UNDERLINE_RULING_COLOR = new Color(6, 39, 171);
|
||||
protected static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6);
|
||||
|
||||
protected static final Color CELLS_COLOR = new Color(31, 214, 27);
|
||||
|
||||
protected static final Color MAIN_BODY_COLOR = new Color(171, 131, 6);
|
||||
protected static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6);
|
||||
|
||||
protected static final List<Color> ROTATING_CHARACTER_COLOR = List.of(new Color(255, 87, 51),
|
||||
new Color(255, 195, 0),
|
||||
new Color(76, 175, 80),
|
||||
new Color(33, 150, 243),
|
||||
new Color(155, 89, 182),
|
||||
new Color(233, 30, 99),
|
||||
new Color(0, 188, 212),
|
||||
new Color(121, 85, 72));
|
||||
|
||||
protected final Visualizations words = Visualizations.builder().layer(LayerIdentifier.WORDS).visibleByDefault(true).build();
|
||||
protected final Visualizations lines = Visualizations.builder().layer(LayerIdentifier.LINES).build();
|
||||
protected final Visualizations zones = Visualizations.builder().layer(LayerIdentifier.ZONES).build();
|
||||
protected final Visualizations mainBody = Visualizations.builder().layer(LayerIdentifier.MAIN_BODY).build();
|
||||
protected final Visualizations clean_rulings = Visualizations.builder().layer(LayerIdentifier.CLEAN_RULINGS).build();
|
||||
protected final Visualizations rulings = Visualizations.builder().layer(LayerIdentifier.RULINGS).build();
|
||||
protected final Visualizations cells = Visualizations.builder().layer(LayerIdentifier.CELLS).build();
|
||||
protected final Visualizations markedContent = Visualizations.builder().layer(LayerIdentifier.MARKED_CONTENT).build();
|
||||
protected final Visualizations neighbours = Visualizations.builder().layer(LayerIdentifier.NEIGHBOURS).build();
|
||||
protected final Visualizations characters = Visualizations.builder().layer(LayerIdentifier.CHARACTERS).build();
|
||||
|
||||
|
||||
public List<Visualizations> getVisualizations() {
|
||||
|
||||
return List.of(characters, //
|
||||
neighbours,//
|
||||
words, //
|
||||
lines, //
|
||||
zones, //
|
||||
rulings, //
|
||||
clean_rulings, //
|
||||
cells, //
|
||||
mainBody, //
|
||||
markedContent //
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,55 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.layers;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
public class LayoutGridLayerConfig extends AbstractLayerGroup {
|
||||
|
||||
@Getter
|
||||
public final LayerIdentifier groupIdentifier = LayerIdentifier.KNECON_LAYOUT;
|
||||
|
||||
public static final LayerGroup CONFIG_INSTANCE = new LayoutGridLayerConfig();
|
||||
|
||||
protected static final float FONT_SIZE = 10f;
|
||||
protected static final float LINE_WIDTH = 1f;
|
||||
protected static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
|
||||
|
||||
protected static final Color INNER_LINES_COLOR = new Color(255, 175, 175);
|
||||
protected static final Color HEADER_CELL_COLOR = new Color(156, 21, 48);
|
||||
protected static final Color PARAGRAPH_COLOR = new Color(70, 130, 180);
|
||||
|
||||
protected static final Color DUPLICATE_PARAGRAPH_COLOR = new Color(70, 180, 101);
|
||||
protected static final Color TABLE_COLOR = new Color(102, 205, 170);
|
||||
protected static final Color SECTION_COLOR = new Color(50, 50, 50);
|
||||
protected static final Color HEADLINE_COLOR = new Color(162, 56, 56);
|
||||
protected static final Color HEADER_COLOR = new Color(171, 131, 6);
|
||||
protected static final Color IMAGE_COLOR = new Color(253, 63, 146);
|
||||
protected static final Color TREEID_COLOR = new Color(53, 53, 53);
|
||||
protected static final Color KEY_VALUE_BBOX_COLOR = new Color(0, 39, 85);
|
||||
protected static final Color KEY_COLOR = new Color(30, 92, 172);
|
||||
protected static final Color VALUE_COLOR = new Color(30, 172, 146);
|
||||
|
||||
protected final Visualizations sections = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_SECTION).visibleByDefault(true).build();
|
||||
protected final Visualizations paragraphs = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_PARAGRAPH).visibleByDefault(true).build();
|
||||
protected final Visualizations headlines = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_HEADLINE).visibleByDefault(true).build();
|
||||
protected final Visualizations tables = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TABLE).visibleByDefault(true).build();
|
||||
protected final Visualizations figures = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_FIGURES).visibleByDefault(true).build();
|
||||
protected final Visualizations headerFooter = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_HEADER_FOOTER).visibleByDefault(true).build();
|
||||
protected final Visualizations images = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_IMAGES).build();
|
||||
protected final Visualizations keyValue = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_KEY_VALUE).build();
|
||||
protected final Visualizations treeIds = Visualizations.builder().layer(LayerIdentifier.KNECON_LAYOUT_TREE_IDs).build();
|
||||
|
||||
|
||||
@Override
|
||||
public List<Visualizations> getVisualizations() {
|
||||
|
||||
return List.of(headlines, paragraphs, tables, sections, headerFooter, keyValue, figures, images, treeIds);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,38 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.layers;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
public class OcrDebugLayerConfig extends AbstractLayerGroup {
|
||||
|
||||
@Getter
|
||||
public final LayerIdentifier groupIdentifier = LayerIdentifier.KNECON_OCR_DEBUG;
|
||||
|
||||
public static final LayerGroup CONFIG_INSTANCE = new OcrDebugLayerConfig();
|
||||
|
||||
protected static final Color REGULAR_COLOR = new Color(6, 39, 171);
|
||||
protected static final Color BOLD_COLOR = new Color(50, 246, 246);
|
||||
protected static final Color ITALIC_COLOR = new Color(171, 105, 6);
|
||||
protected static final Color BOLD_ITALIC_COLOR = new Color(6, 171, 102);
|
||||
protected static final Color HANDWRITTEN_COLOR = new Color(171, 64, 6);
|
||||
protected static final Color OVERLAPPED_COLOR = new Color(142, 8, 8);
|
||||
protected static final Color TABLE_LINES_COLOR = new Color(21, 221, 174);
|
||||
|
||||
protected final Visualizations debugText = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_TEXT_DEBUG).visibleByDefault(true).build();
|
||||
protected final Visualizations tableLines = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_LINE_DEBUG).visibleByDefault(true).build();
|
||||
protected final Visualizations overlappedText = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_OVERLAPPED_TEXT).visibleByDefault(false).build();
|
||||
protected final Visualizations debugBBox = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_BBOX_DEBUG).visibleByDefault(false).build();
|
||||
|
||||
|
||||
@Override
|
||||
public List<Visualizations> getVisualizations() {
|
||||
|
||||
return List.of(debugText, tableLines, debugBBox, overlappedText);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,40 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.layers;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
|
||||
public class OcrTextLayerConfig extends AbstractLayerGroup {
|
||||
|
||||
protected final Visualizations ocrText = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_TEXT).build();
|
||||
protected final Visualizations tableLines = Visualizations.builder().layer(LayerIdentifier.KNECON_OCR_LINES).build();
|
||||
|
||||
|
||||
@Override
|
||||
public LayerIdentifier getGroupIdentifier() {
|
||||
|
||||
return LayerIdentifier.KNECON_OCR;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Visualizations> getVisualizations() {
|
||||
|
||||
return List.of(ocrText, tableLines);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean subLayersAreOptionalContent() {
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isOptionalContent() {
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,27 +0,0 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.model;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class LayoutGrid {
|
||||
|
||||
int numberOfPages;
|
||||
Map<Integer, VisualizationsOnPage> visualizationsPerPages;
|
||||
|
||||
|
||||
public LayoutGrid(int numberOfPages) {
|
||||
|
||||
this.numberOfPages = numberOfPages;
|
||||
this.visualizationsPerPages = new HashMap<>();
|
||||
for (int i = 0; i < numberOfPages; i++) {
|
||||
this.visualizationsPerPages.put(i, VisualizationsOnPage.builder().build());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,10 +1,12 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service.pdftron;
|
||||
package com.knecon.fforesight.service.viewerdoc.model;
|
||||
|
||||
import java.util.Deque;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
|
||||
|
||||
public class MarkedContentStack {
|
||||
|
||||
private final Deque<MarkedContent> stack = new LinkedList<>();
|
||||
@ -44,7 +46,23 @@ public class MarkedContentStack {
|
||||
}
|
||||
|
||||
|
||||
public boolean currentMarkedContentContainsAny(Set<String> names) {
|
||||
public boolean currentMarkedContentContainsNone(Set<String> names) {
|
||||
|
||||
if (stack.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
Iterator<MarkedContent> markedContentIterator = stack.descendingIterator();
|
||||
while (markedContentIterator.hasNext()) {
|
||||
var markedContent = markedContentIterator.next();
|
||||
if (names.contains(markedContent.name())) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public boolean currentMarkedContentIsKneconContent() {
|
||||
|
||||
if (stack.isEmpty()) {
|
||||
return false;
|
||||
@ -52,11 +70,12 @@ public class MarkedContentStack {
|
||||
Iterator<MarkedContent> markedContentIterator = stack.descendingIterator();
|
||||
while (markedContentIterator.hasNext()) {
|
||||
var markedContent = markedContentIterator.next();
|
||||
if (names.contains(markedContent.name())) {
|
||||
if (markedContent.name().startsWith(LayerIdentifier.KNECON_IDENTIFIER_PREFIX)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -1,10 +0,0 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
|
||||
public record OperatorWithArguments(Operator operator, List<COSBase> arguments) {
|
||||
|
||||
}
|
||||
@ -1,13 +1,14 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.model;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
public record PlacedText(String text, Point2D lineStart, Color color, float fontSize, EmbeddableFont font, Optional<Matrix> textMatrix, Optional<RenderingMode> renderingMode) {
|
||||
public record PlacedText(String text, Point2D lineStart, Color color, float fontSize, EmbeddableFont font, Optional<AffineTransform> textMatrix, Optional<RenderingMode> renderingMode) {
|
||||
|
||||
public static PlacedText textFacingUp(String text, Point2D lineStart, float fontSize, Color color, EmbeddableFont font) {
|
||||
|
||||
|
||||
@ -3,7 +3,7 @@ package com.knecon.fforesight.service.viewerdoc.model;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -17,9 +17,10 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class Visualizations {
|
||||
|
||||
ContentStreams.Identifier layer;
|
||||
LayerIdentifier layer;
|
||||
@Builder.Default
|
||||
Map<Integer, VisualizationsOnPage> visualizationsOnPages = new LinkedHashMap<>();
|
||||
boolean layerVisibilityDefaultValue;
|
||||
|
||||
boolean visibleByDefault;
|
||||
|
||||
}
|
||||
|
||||
@ -14,6 +14,7 @@ import lombok.experimental.FieldDefaults;
|
||||
public class VisualizationsOnPage {
|
||||
|
||||
boolean makePathsInvisible;
|
||||
boolean inDeviceCoordinates;
|
||||
@Builder.Default
|
||||
List<PlacedText> placedTexts = new LinkedList<>();
|
||||
@Builder.Default
|
||||
|
||||
@ -1,7 +0,0 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.pdf;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
|
||||
public record ClassifiedContentStream(SinglePDContentStream contentStream, ContentStreams.Identifier classification) {
|
||||
|
||||
}
|
||||
@ -1,61 +0,0 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.pdf;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
||||
import org.apache.pdfbox.contentstream.PDContentStream;
|
||||
import org.apache.pdfbox.io.RandomAccessInputStream;
|
||||
import org.apache.pdfbox.io.RandomAccessRead;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.common.PDStream;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class SinglePDContentStream implements PDContentStream {
|
||||
|
||||
PDStream pdStream;
|
||||
|
||||
|
||||
@Override
|
||||
public InputStream getContents() throws IOException {
|
||||
|
||||
return new RandomAccessInputStream(getContentsForRandomAccess());
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public RandomAccessRead getContentsForRandomAccess() throws IOException {
|
||||
|
||||
return pdStream.getCOSObject().createView();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public PDResources getResources() {
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public PDRectangle getBBox() {
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Matrix getMatrix() {
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,121 +0,0 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.pdfbox.contentstream.PDContentStream;
|
||||
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.OperatorWithArguments;
|
||||
import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream;
|
||||
import com.knecon.fforesight.service.viewerdoc.pdf.SinglePDContentStream;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ContentStreamClassifier {
|
||||
|
||||
public List<ClassifiedContentStream> getClassifiedContentStreams(PDPage page) {
|
||||
|
||||
List<SinglePDContentStream> streams = new LinkedList<>();
|
||||
page.getContentStreams().forEachRemaining(stream -> streams.add(new SinglePDContentStream(stream)));
|
||||
return ContentStreamClassifier.classifySingleContentStreams(page, streams);
|
||||
}
|
||||
|
||||
|
||||
public List<ClassifiedContentStream> classifySingleContentStreams(PDPage page, List<SinglePDContentStream> streams) {
|
||||
|
||||
return streams.stream().map(singlePDContentStream -> classifySingleContentStream(page, singlePDContentStream)).toList();
|
||||
}
|
||||
|
||||
|
||||
private ClassifiedContentStream classifySingleContentStream(PDPage page, SinglePDContentStream singlePDContentStream) {
|
||||
|
||||
ContentStreams.Identifier classification = classifyContentStream(singlePDContentStream, page);
|
||||
return new ClassifiedContentStream(singlePDContentStream, classification);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* We assume all of our layers are written escaped, so only unknown content streams need to be escaped.
|
||||
*
|
||||
* @param classifiers List of all content streams of a page with their classification
|
||||
* @return false, if any content stream with classification other is not prefixed with an ESCAPE_START and suffixed with an ESCAPE_END
|
||||
*/
|
||||
public boolean areAllContentStreamsEscaped(List<ClassifiedContentStream> classifiers) {
|
||||
|
||||
int escapeDepth = 0;
|
||||
for (ClassifiedContentStream classifier : classifiers) {
|
||||
if (classifier.classification().equals(ContentStreams.OTHER) && escapeDepth == 0) {
|
||||
return false;
|
||||
}
|
||||
if (classifier.classification().equals(ContentStreams.ESCAPE_START)) {
|
||||
escapeDepth++;
|
||||
}
|
||||
if (classifier.classification().equals(ContentStreams.ESCAPE_END)) {
|
||||
escapeDepth--;
|
||||
}
|
||||
}
|
||||
return escapeDepth == 0;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public ContentStreams.Identifier classifyContentStream(PDContentStream contentStream, PDPage page) {
|
||||
|
||||
List<OperatorWithArguments> operatorsWithArguments = ContentStreamUtility.parseLeadingOperators(contentStream, 2);
|
||||
if (operatorsWithArguments.isEmpty()) {
|
||||
return ContentStreams.OTHER;
|
||||
}
|
||||
OperatorWithArguments firstOperator = operatorsWithArguments.get(0);
|
||||
|
||||
// If we wrap the content streams we append and prepend a content stream with exactly one operator "q" or "Q".
|
||||
if (operatorsWithArguments.size() == 1) {
|
||||
if (firstOperator.operator().getName().equals(OperatorName.SAVE)) {
|
||||
return ContentStreams.ESCAPE_START;
|
||||
}
|
||||
if (firstOperator.operator().getName().equals(OperatorName.RESTORE)) {
|
||||
return ContentStreams.ESCAPE_END;
|
||||
}
|
||||
}
|
||||
|
||||
// In previous versions we did not set a marked content with an explicit name. Instead, we wrote an optional content group (OCG) with the name "Layout grid".
|
||||
// This OCG is then assigned a COSName by PDFBox. Usually its "oc1".
|
||||
// Thus, in order to find this name we need to look in the page resources to find the COSName assigned to the OCG.
|
||||
// This COSName can then be found as an argument for the first operator in the content stream.
|
||||
if (firstOperator.operator().getName().equals(OperatorName.BEGIN_MARKED_CONTENT_SEQ)) {
|
||||
Optional<COSName> layoutGridOCGName = ContentStreamUtility.findLayoutGridOCGName(page);
|
||||
if (layoutGridOCGName.isPresent()) {
|
||||
if (arumentsContainLayoutGridOCG(firstOperator, layoutGridOCGName.get())) {
|
||||
return ContentStreams.KNECON_LAYOUT;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!firstOperator.operator().getName().equals(OperatorName.BEGIN_MARKED_CONTENT)) {
|
||||
return ContentStreams.OTHER;
|
||||
}
|
||||
|
||||
Optional<COSName> firstCOSNameFromArguments = firstOperator.arguments().stream().filter(c -> c instanceof COSName).map(c -> (COSName) c).findFirst();
|
||||
|
||||
if (firstCOSNameFromArguments.isEmpty()) {
|
||||
return ContentStreams.OTHER;
|
||||
}
|
||||
|
||||
var cosName = firstCOSNameFromArguments.get();
|
||||
|
||||
return ContentStreams.allContentStreams.stream().filter(identifier -> identifier.cosName().equals(cosName)).findAny().orElse(ContentStreams.OTHER);
|
||||
}
|
||||
|
||||
|
||||
private static boolean arumentsContainLayoutGridOCG(OperatorWithArguments operator, COSName layoutGridOCGName) {
|
||||
|
||||
return operator.arguments().stream().filter(c -> c instanceof COSName).map(c -> (COSName) c).anyMatch(cosName -> cosName.equals(layoutGridOCGName));
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,77 +0,0 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.pdfbox.contentstream.PDContentStream;
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.cos.COSString;
|
||||
import org.apache.pdfbox.pdfparser.PDFStreamParser;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDStream;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.OperatorWithArguments;
|
||||
import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream;
|
||||
import com.knecon.fforesight.service.viewerdoc.pdf.SinglePDContentStream;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ContentStreamUtility {
|
||||
|
||||
public static List<OperatorWithArguments> parseLeadingOperators(PDContentStream contentStream,
|
||||
int numberOfOperatorsToRead) throws IOException {
|
||||
|
||||
List<COSBase> arguments = new ArrayList<>();
|
||||
PDFStreamParser parser = new PDFStreamParser(contentStream);
|
||||
List<OperatorWithArguments> operatorsWithArguments = new LinkedList<>();
|
||||
for (int i = 0; i < numberOfOperatorsToRead; ) {
|
||||
Object token = parser.parseNextToken();
|
||||
if (token == null) {
|
||||
break;
|
||||
}
|
||||
if (token instanceof Operator operator) {
|
||||
operatorsWithArguments.add(new OperatorWithArguments(operator, arguments));
|
||||
arguments = new ArrayList<>();
|
||||
i++;
|
||||
} else {
|
||||
arguments.add((COSBase) token);
|
||||
}
|
||||
|
||||
}
|
||||
return operatorsWithArguments;
|
||||
}
|
||||
|
||||
|
||||
public static Optional<COSName> findLayoutGridOCGName(PDPage page) {
|
||||
|
||||
var resourceIterator = page.getResources().getPropertiesNames();
|
||||
for (COSName cosName : resourceIterator) {
|
||||
COSBase cosBase = page.getResources().getProperties(cosName).getCOSObject().getDictionaryObject(COSName.NAME);
|
||||
if (cosBase instanceof COSString string) {
|
||||
if (ContentStreams.KNECON_LAYOUT.name().equals(string.getString())) {
|
||||
return Optional.of(cosName);
|
||||
}
|
||||
}
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
public static List<PDStream> removeLayerFromContentStreams(Set<ContentStreams.Identifier> layers, List<ClassifiedContentStream> classifiers) {
|
||||
|
||||
return classifiers.stream()
|
||||
.filter(classifiedContentStream -> !layers.contains(classifiedContentStream.classification()))
|
||||
.map(ClassifiedContentStream::contentStream)
|
||||
.map(SinglePDContentStream::getPdStream)
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,27 +0,0 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
|
||||
public interface IViewerDocumentService {
|
||||
|
||||
void addVisualizationsOnPage(File originFile, File destinationFile, List<Visualizations> visualizations);
|
||||
|
||||
default void enrichObservation(ObservationRegistry registry, int numberOfPages, List<ContentStreams.Identifier> layers) {
|
||||
|
||||
if (registry == null || registry.getCurrentObservation() == null || registry.isNoop()) {
|
||||
return;
|
||||
}
|
||||
registry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages));
|
||||
for (int i = 0; i < layers.size(); i++) {
|
||||
ContentStreams.Identifier layer = layers.get(i);
|
||||
|
||||
registry.getCurrentObservation().highCardinalityKeyValue("layer_" + i, String.valueOf(layer.name()));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service.pdftron;
|
||||
package com.knecon.fforesight.service.viewerdoc.service;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
@ -11,16 +11,17 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.IdpLayerConfig;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.LayoutDebugLayerConfig;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.LayoutGridLayerConfig;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.OcrDebugLayerConfig;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.IViewerDocumentService;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
@ -39,16 +40,19 @@ import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
public class PDFTronViewerDocumentService implements IViewerDocumentService {
|
||||
public class PDFTronViewerDocumentService {
|
||||
|
||||
private final ObservationRegistry registry;
|
||||
|
||||
public static final List<LayerGroup> ALL_LAYERS_WITH_OPTIONAL_CONTENT = List.of(LayoutGridLayerConfig.CONFIG_INSTANCE,
|
||||
OcrDebugLayerConfig.CONFIG_INSTANCE,
|
||||
LayoutDebugLayerConfig.CONFIG_INSTANCE,
|
||||
IdpLayerConfig.CONFIG_INSTANCE);
|
||||
|
||||
|
||||
@Override
|
||||
@Observed(name = "PDFTronViewerDocumentService", contextualName = "add-visualizations")
|
||||
@SneakyThrows
|
||||
public synchronized void addVisualizationsOnPage(File originFile, File destinationFile, List<Visualizations> visualizations) {
|
||||
|
||||
@Observed(name = "PDFTronViewerDocumentService", contextualName = "add-visualizations")
|
||||
public synchronized void addLayerGroups(File originFile, File destinationFile, List<LayerGroup> layerGroups) {
|
||||
|
||||
// originFile and destinationFile might be the same, so we use a temp file.
|
||||
// Otherwise, saving the document might corrupt the file
|
||||
@ -62,63 +66,73 @@ public class PDFTronViewerDocumentService implements IViewerDocumentService {
|
||||
) {
|
||||
enrichObservation(registry,
|
||||
pdfDoc.getPageCount(),
|
||||
visualizations.stream()
|
||||
layerGroups.stream()
|
||||
.map(LayerGroup::getVisualizations)
|
||||
.flatMap(Collection::stream)
|
||||
.map(Visualizations::getLayer)
|
||||
.toList());
|
||||
|
||||
Map<ContentStreams.Identifier, Group> groupMap = PdftronLayerUtility.addLayersToDocument(visualizations, pdfDoc);
|
||||
Map<LayerIdentifier, Group> groupMap = PdftronLayerUtility.addLayersToDocument(layerGroups, pdfDoc);
|
||||
|
||||
Map<EmbeddableFont, Font> fontMap = buildFontMap(visualizations, pdfDoc);
|
||||
Map<EmbeddableFont, Font> fontMap = buildFontMap(layerGroups, pdfDoc);
|
||||
|
||||
Set<String> markedContentToDraw = extractMarkedContentNames(visualizations.stream()
|
||||
.map(Visualizations::getLayer));
|
||||
|
||||
Set<String> kneconMarkedContents = extractMarkedContentNames(ContentStreams.allContentStreams.stream());
|
||||
Set<String> markedContentToDraw = mapMarkedContentNames(layerGroups);
|
||||
|
||||
PageContentCleaner pageContentCleaner = PageContentCleaner.builder()
|
||||
.writer(pageWriter)
|
||||
.reader(reader)
|
||||
.elementBuilder(builder)
|
||||
.markedContentToDraw(markedContentToDraw)
|
||||
.kneconMarkedContents(kneconMarkedContents)
|
||||
.markedContentToRemove(markedContentToDraw)
|
||||
.build();
|
||||
|
||||
VisualizationWriter visualizationWriter = VisualizationWriter.builder()
|
||||
.writer(pageWriter)
|
||||
.builder(builder)
|
||||
.groupMap(groupMap)
|
||||
.visualizations(visualizations)
|
||||
.layerGroups(layerGroups)
|
||||
.fontMap(fontMap)
|
||||
.build();
|
||||
|
||||
int pageNumber = 0;
|
||||
boolean isCurrentVersion = ViewerDocVersioningUtility.docIsCurrentVersion(pdfDoc);
|
||||
|
||||
int pageNumber = 1;
|
||||
for (PageIterator iterator = pdfDoc.getPageIterator(); iterator.hasNext(); pageNumber++) {
|
||||
|
||||
Page page = iterator.next();
|
||||
|
||||
pageContentCleaner.cleanPage(page);
|
||||
if (isCurrentVersion) {
|
||||
pageContentCleaner.removeMarkedContent(page);
|
||||
}
|
||||
|
||||
visualizationWriter.drawVisualizationsOnPage(pageNumber, page);
|
||||
|
||||
}
|
||||
|
||||
ViewerDocVersioningUtility.setVersionInDocument(pdfDoc);
|
||||
|
||||
saveDocument(pdfDoc, destinationFile);
|
||||
} finally {
|
||||
assert !tmpFile.toFile().exists() || tmpFile.toFile().delete();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static Set<String> extractMarkedContentNames(Stream<ContentStreams.Identifier> visualizations) {
|
||||
private static Set<String> mapMarkedContentNames(List<LayerGroup> layerGroups) {
|
||||
|
||||
return visualizations.map(ContentStreams.Identifier::cosName)
|
||||
.map(COSName::getName)
|
||||
return layerGroups.stream()
|
||||
.map(LayerGroup::getVisualizations)
|
||||
.flatMap(Collection::stream)
|
||||
.map(Visualizations::getLayer)
|
||||
.map(LayerIdentifier::name)
|
||||
.collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
|
||||
private static Map<EmbeddableFont, Font> buildFontMap(List<Visualizations> visualizations, PDFDoc pdfDoc) {
|
||||
private static Map<EmbeddableFont, Font> buildFontMap(List<LayerGroup> layerGroups, PDFDoc pdfDoc) {
|
||||
|
||||
return visualizations.stream()
|
||||
return layerGroups.stream()
|
||||
.map(LayerGroup::getVisualizations)
|
||||
.flatMap(Collection::stream)
|
||||
.map(Visualizations::getVisualizationsOnPages)
|
||||
.map(Map::values)
|
||||
.flatMap(Collection::stream)
|
||||
@ -146,4 +160,18 @@ public class PDFTronViewerDocumentService implements IViewerDocumentService {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void enrichObservation(ObservationRegistry registry, int numberOfPages, List<LayerIdentifier> layers) {
|
||||
|
||||
if (registry == null || registry.getCurrentObservation() == null || registry.isNoop()) {
|
||||
return;
|
||||
}
|
||||
registry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages));
|
||||
for (int i = 0; i < layers.size(); i++) {
|
||||
LayerIdentifier layer = layers.get(i);
|
||||
|
||||
registry.getCurrentObservation().highCardinalityKeyValue("layer_" + i, String.valueOf(layer.name()));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,83 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.model.MarkedContentStack;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.Page;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Builder;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Builder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class PageContentCleaner {
|
||||
|
||||
ElementWriter writer;
|
||||
ElementReader reader;
|
||||
ElementBuilder elementBuilder;
|
||||
Set<String> markedContentToRemove;
|
||||
|
||||
@Builder.Default
|
||||
MarkedContentStack markedContentStack = new MarkedContentStack();
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void removeMarkedContent(Page page) {
|
||||
|
||||
begin(page);
|
||||
copyElementsExceptMarkedContentToRemove();
|
||||
end();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void begin(Page page) {
|
||||
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
reader.begin(page);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void end() {
|
||||
|
||||
writer.end();
|
||||
reader.end();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void copyElementsExceptMarkedContentToRemove() {
|
||||
|
||||
for (Element element = reader.next(); element != null; element = reader.next()) {
|
||||
switch (element.getType()) {
|
||||
case Element.e_marked_content_begin -> {
|
||||
markedContentStack.enterMarkedContent(element.getMCTag().getName());
|
||||
if (markedContentStack.currentMarkedContentContainsNone(markedContentToRemove)) {
|
||||
writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
case Element.e_marked_content_end -> {
|
||||
if (markedContentStack.currentMarkedContentContainsNone(markedContentToRemove)) {
|
||||
writer.writeElement(element);
|
||||
}
|
||||
markedContentStack.leaveMarkedContent();
|
||||
}
|
||||
default -> {
|
||||
if (markedContentStack.currentMarkedContentContainsNone(markedContentToRemove)) {
|
||||
writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,187 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.ocg.Config;
|
||||
import com.pdftron.pdf.ocg.Group;
|
||||
import com.pdftron.sdf.Obj;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class PdftronLayerUtility {
|
||||
|
||||
@SneakyThrows
|
||||
public Map<LayerIdentifier, Group> addLayersToDocument(List<LayerGroup> layerGroups, PDFDoc pdfDoc) {
|
||||
|
||||
Map<LayerIdentifier, Group> optionalContentGroupMap = new HashMap<>();
|
||||
|
||||
for (var layerGroup : layerGroups) {
|
||||
|
||||
if (!layerGroup.subLayersAreOptionalContent() || layerGroup.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (layerGroup.isOptionalContent()) {
|
||||
Group group = addLayerToDocument(pdfDoc, layerGroup.getGroupIdentifier().name(), layerGroup.isVisibleByDefault());
|
||||
optionalContentGroupMap.put(layerGroup.getGroupIdentifier(), group);
|
||||
}
|
||||
|
||||
if (layerGroup.subLayersAreOptionalContent()) {
|
||||
for (Visualizations subLayer : layerGroup.getVisualizations()) {
|
||||
Group subGroup = addLayerToDocument(pdfDoc, subLayer.getLayer().name(), layerGroup.isVisibleByDefault());
|
||||
optionalContentGroupMap.put(subLayer.getLayer(), subGroup);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
setOrderArrayForPresentGroups(pdfDoc, PDFTronViewerDocumentService.ALL_LAYERS_WITH_OPTIONAL_CONTENT);
|
||||
|
||||
return optionalContentGroupMap;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void setOrderArrayForPresentGroups(PDFDoc pdfDoc, List<LayerGroup> layerGroups) {
|
||||
|
||||
Config cfg = getConfig(pdfDoc);
|
||||
Obj orderArray = pdfDoc.createIndirectArray();
|
||||
|
||||
Map<String, Group> groupMap = findAllGroupsInDocAsMap(pdfDoc);
|
||||
|
||||
for (var layerGroup : layerGroups) {
|
||||
|
||||
Obj childOrderArray;
|
||||
|
||||
if (!layerGroup.subLayersAreOptionalContent()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (layerGroup.isOptionalContent() && groupMap.containsKey(layerGroup.getGroupIdentifier().name())) {
|
||||
Group group = groupMap.remove(layerGroup.getGroupIdentifier().name());
|
||||
group.setInitialState(cfg, layerGroup.isVisibleByDefault());
|
||||
orderArray.pushBack(group.getSDFObj());
|
||||
childOrderArray = pdfDoc.createIndirectArray();
|
||||
orderArray.pushBack(childOrderArray);
|
||||
} else {
|
||||
childOrderArray = orderArray;
|
||||
}
|
||||
|
||||
for (Visualizations subLayer : layerGroup.getVisualizations()) {
|
||||
if (groupMap.containsKey(subLayer.getLayer().name())) {
|
||||
Group group = groupMap.remove(subLayer.getLayer().name());
|
||||
group.setInitialState(cfg, subLayer.isVisibleByDefault());
|
||||
childOrderArray.pushBack(group.getSDFObj());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!groupMap.isEmpty()) {
|
||||
for (Group group : groupMap.values()) {
|
||||
orderArray.pushBack(group.getSDFObj());
|
||||
}
|
||||
}
|
||||
|
||||
cfg.setOrder(orderArray);
|
||||
cfg.getSDFObj().putText("ListMode", "VisiblePages");
|
||||
}
|
||||
|
||||
|
||||
private static Map<String, Group> findAllGroupsInDocAsMap(PDFDoc pdfDoc) throws PDFNetException {
|
||||
|
||||
Map<String, Group> groupMap = new LinkedHashMap<>();
|
||||
|
||||
List<Group> presentGroups = findAllGroupsInDoc(pdfDoc);
|
||||
|
||||
for (Group group : presentGroups) {
|
||||
groupMap.put(group.getName(), group);
|
||||
}
|
||||
return groupMap;
|
||||
}
|
||||
|
||||
|
||||
private static Config getConfig(PDFDoc pdfDoc) throws PDFNetException {
|
||||
|
||||
Config cfg = pdfDoc.getOCGConfig();
|
||||
if (cfg == null) {
|
||||
cfg = Config.create(pdfDoc, true);
|
||||
}
|
||||
cfg.setName("knecon debug layer order");
|
||||
cfg.setCreator("Knecon Technology GmbH");
|
||||
return cfg;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Group addLayerToDocument(PDFDoc doc, String layerName, boolean layerVisibilityDefaultValue) {
|
||||
|
||||
Optional<Group> existingGroup = findGroupInDoc(doc, layerName);
|
||||
|
||||
if (existingGroup.isPresent()) {
|
||||
return existingGroup.get();
|
||||
}
|
||||
|
||||
return addNewLayer(doc, layerName, layerVisibilityDefaultValue, false);
|
||||
}
|
||||
|
||||
|
||||
private Group addNewLayer(PDFDoc doc, String layerName, boolean layerVisibilityDefaultValue, boolean containsAll) throws PDFNetException {
|
||||
|
||||
Config cfg = getConfig(doc);
|
||||
Group grp = Group.create(doc, layerName);
|
||||
grp.setInitialState(cfg, layerVisibilityDefaultValue);
|
||||
|
||||
return grp;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Optional<Group> findGroupInDoc(PDFDoc doc, String layerName) {
|
||||
|
||||
Obj ocgs = doc.getOCGs();
|
||||
if (ocgs != null) {
|
||||
int i;
|
||||
int sz = (int) ocgs.size();
|
||||
for (i = 0; i < sz; ++i) {
|
||||
Group ocg = new Group(ocgs.getAt(i));
|
||||
if (ocg.getName().equals(layerName)) {
|
||||
return Optional.of(ocg);
|
||||
}
|
||||
}
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private List<Group> findAllGroupsInDoc(PDFDoc doc) {
|
||||
|
||||
Obj ocgs = doc.getOCGs();
|
||||
|
||||
if (ocgs == null) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
List<Group> groups = new ArrayList<>(Math.toIntExact(ocgs.size()));
|
||||
int i;
|
||||
int sz = (int) ocgs.size();
|
||||
for (i = 0; i < sz; ++i) {
|
||||
Group ocg = new Group(ocgs.getAt(i));
|
||||
groups.add(ocg);
|
||||
}
|
||||
return groups;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,70 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ViewerDocVersioningUtility {
|
||||
|
||||
public static final int currentVersion = 0;
|
||||
public static final String AUTHOR = "knecon technology GmbH";
|
||||
public static final String CUSTOM_DICT = "KNECON_VERSION";
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void setVersionInDocument(PDFDoc pdfDoc) {
|
||||
|
||||
pdfDoc.getDocInfo().setAuthor(AUTHOR);
|
||||
pdfDoc.getDocInfo().setKeywords(CUSTOM_DICT + ":" + currentVersion);
|
||||
}
|
||||
|
||||
|
||||
private static Optional<Integer> readVersionFromKeywords(String keywords) {
|
||||
|
||||
String[] strings = keywords.split(":");
|
||||
if (strings.length != 2) {
|
||||
return Optional.empty();
|
||||
}
|
||||
if (!strings[0].equals(CUSTOM_DICT)) {
|
||||
return Optional.empty();
|
||||
}
|
||||
try {
|
||||
return Optional.of(Integer.parseInt(strings[1]));
|
||||
} catch (NumberFormatException e) {
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public boolean isCurrentVersion(File file) {
|
||||
|
||||
try (PDDocument doc = Loader.loadPDF(file)) {
|
||||
return isCurrentVersion(doc.getDocumentInformation().getAuthor(), doc.getDocumentInformation().getKeywords());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public boolean docIsCurrentVersion(PDFDoc pdfDoc) {
|
||||
|
||||
return isCurrentVersion(pdfDoc.getDocInfo().getAuthor(), pdfDoc.getDocInfo().getKeywords());
|
||||
}
|
||||
|
||||
|
||||
private static boolean isCurrentVersion(String author, String keywords) {
|
||||
|
||||
return Objects.equals(author, AUTHOR) //
|
||||
&& readVersionFromKeywords(keywords).map(version -> version == currentVersion).orElse(false);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,324 +0,0 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.PDResources;
|
||||
import org.apache.pdfbox.pdmodel.font.PDFont;
|
||||
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentGroup;
|
||||
import org.apache.pdfbox.pdmodel.graphics.optionalcontent.PDOptionalContentProperties;
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState;
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
|
||||
import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream;
|
||||
|
||||
import io.micrometer.observation.Observation;
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
import io.micrometer.observation.annotation.Observed;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
public class ViewerDocumentService implements IViewerDocumentService {
|
||||
|
||||
private final ObservationRegistry registry;
|
||||
|
||||
|
||||
@Observed(name = "ViewerDocumentService", contextualName = "add-visualizations")
|
||||
@SneakyThrows
|
||||
public void addVisualizationsOnPage(File originFile, File destinationFile, List<Visualizations> visualizations) {
|
||||
|
||||
// originFile and destinationFile might be the same, so we use a temp file.
|
||||
// Otherwise, saving the document might corrupt the file
|
||||
Path tmpFile = Files.createTempFile("tmpViewerDocument", ".pdf");
|
||||
Files.copy(originFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
|
||||
PDDocument pdDocument = openPDDocument(tmpFile.toFile());
|
||||
|
||||
enrichObservation(registry,
|
||||
pdDocument.getNumberOfPages(),
|
||||
visualizations.stream()
|
||||
.map(Visualizations::getLayer)
|
||||
.toList());
|
||||
|
||||
Set<ContentStreams.Identifier> allLayers = visualizations.stream()
|
||||
.map(Visualizations::getLayer)
|
||||
.collect(Collectors.toUnmodifiableSet());
|
||||
|
||||
Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument);
|
||||
|
||||
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber);
|
||||
createPageResourcesIfNotPresent(pdPage); // needed for optionalContentGroups
|
||||
|
||||
List<ClassifiedContentStream> classifiers = ContentStreamClassifier.getClassifiedContentStreams(pdPage);
|
||||
|
||||
pdPage.setContents(ContentStreamUtility.removeLayerFromContentStreams(allLayers, classifiers));
|
||||
|
||||
AffineTransform textDeRotationMatrix = getTextDeRotationTransform(pdPage);
|
||||
|
||||
if (!ContentStreamClassifier.areAllContentStreamsEscaped(classifiers)) {
|
||||
// We need to save the graphics state before, such that our appended content cannot be affected by previous content streams with side effects,
|
||||
// e.g. not escaped matrix transformations.
|
||||
wrapContentStreams(pdDocument, pdPage);
|
||||
}
|
||||
|
||||
for (Visualizations visualization : visualizations) {
|
||||
if (!visualization.getVisualizationsOnPages().containsKey(pageNumber)) {
|
||||
continue;
|
||||
}
|
||||
// We need to append to the content stream, otherwise the content could be overlapped by following content.
|
||||
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
|
||||
|
||||
contentStream.beginMarkedContent(visualization.getLayer().cosName());
|
||||
|
||||
if (optionalContentGroupMap.containsKey(visualization.getLayer())) {
|
||||
contentStream.beginMarkedContent(COSName.OC, optionalContentGroupMap.get(visualization.getLayer()));
|
||||
}
|
||||
|
||||
contentStream.saveGraphicsState();
|
||||
|
||||
drawVisualizationsToContentStream(pdDocument,
|
||||
visualization.getVisualizationsOnPages()
|
||||
.get(pageNumber),
|
||||
contentStream,
|
||||
textDeRotationMatrix);
|
||||
|
||||
contentStream.restoreGraphicsState();
|
||||
|
||||
if (optionalContentGroupMap.containsKey(visualization.getLayer())) {
|
||||
contentStream.endMarkedContent();
|
||||
}
|
||||
|
||||
contentStream.endMarkedContent();
|
||||
}
|
||||
|
||||
}
|
||||
if (pageNumber % 500 == 0 && pageNumber != 0) { // re-open document every once in a while to save on RAM
|
||||
log.info("Incremental save after {}/{} pages", pageNumber, pdDocument.getNumberOfPages());
|
||||
observedIncrementalSave(pdDocument, destinationFile);
|
||||
pdDocument.close();
|
||||
Files.copy(destinationFile.toPath(), tmpFile, StandardCopyOption.REPLACE_EXISTING);
|
||||
pdDocument = openPDDocument(tmpFile.toFile());
|
||||
}
|
||||
}
|
||||
observedIncrementalSave(pdDocument, destinationFile);
|
||||
|
||||
pdDocument.close();
|
||||
assert tmpFile.toFile().delete();
|
||||
}
|
||||
|
||||
|
||||
private static Map<ContentStreams.Identifier, PDOptionalContentGroup> addLayersToDocument(List<Visualizations> visualizations, PDDocument pdDocument) {
|
||||
|
||||
Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap = new HashMap<>();
|
||||
for (Visualizations visualization : visualizations) {
|
||||
addLayerToDocument(visualization.getLayer(), pdDocument, visualization.isLayerVisibilityDefaultValue())//
|
||||
.ifPresent(ocg -> optionalContentGroupMap.put(visualization.getLayer(), ocg));
|
||||
}
|
||||
return optionalContentGroupMap;
|
||||
}
|
||||
|
||||
|
||||
private static void drawVisualizationsToContentStream(PDDocument pdDocument,
|
||||
VisualizationsOnPage visualizationsOnPage,
|
||||
PDPageContentStream contentStream,
|
||||
AffineTransform textDeRotationMatrix) throws IOException {
|
||||
|
||||
if (visualizationsOnPage.isMakePathsInvisible()) {
|
||||
contentStream.addRect(0, 0, 1, 1);
|
||||
contentStream.clip();
|
||||
}
|
||||
|
||||
for (ColoredLine coloredLine : visualizationsOnPage.getColoredLines()) {
|
||||
contentStream.setLineWidth(coloredLine.lineWidth());
|
||||
contentStream.setStrokingColor(coloredLine.color());
|
||||
contentStream.moveTo((float) coloredLine.line().getX1(), (float) coloredLine.line().getY1());
|
||||
contentStream.lineTo((float) coloredLine.line().getX2(), (float) coloredLine.line().getY2());
|
||||
contentStream.stroke();
|
||||
}
|
||||
|
||||
for (ColoredRectangle coloredRectangle : visualizationsOnPage.getColoredRectangles()) {
|
||||
contentStream.setLineWidth(coloredRectangle.lineWidth());
|
||||
contentStream.setStrokingColor(coloredRectangle.color());
|
||||
Rectangle2D r = coloredRectangle.rectangle2D();
|
||||
contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
|
||||
contentStream.stroke();
|
||||
}
|
||||
|
||||
for (FilledRectangle filledRectangle : visualizationsOnPage.getFilledRectangles()) {
|
||||
contentStream.setNonStrokingColor(filledRectangle.color());
|
||||
PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState();
|
||||
graphicsState.setNonStrokingAlphaConstant(filledRectangle.alpha());
|
||||
contentStream.setGraphicsStateParameters(graphicsState);
|
||||
Rectangle2D r = filledRectangle.rectangle2D();
|
||||
contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
|
||||
contentStream.fill();
|
||||
}
|
||||
|
||||
for (PlacedText placedText : visualizationsOnPage.getPlacedTexts()) {
|
||||
PDFont font = placedText.font().embed(pdDocument);
|
||||
contentStream.setFont(font, placedText.fontSize());
|
||||
contentStream.beginText();
|
||||
contentStream.setNonStrokingColor(placedText.color());
|
||||
if (placedText.renderingMode()
|
||||
.isPresent()) {
|
||||
contentStream.setRenderingMode(placedText.renderingMode()
|
||||
.get());
|
||||
} else {
|
||||
contentStream.setRenderingMode(RenderingMode.FILL);
|
||||
}
|
||||
Matrix textMatrix = getTextMatrix(placedText, textDeRotationMatrix);
|
||||
contentStream.setTextMatrix(textMatrix);
|
||||
contentStream.showText(placedText.text());
|
||||
contentStream.endText();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void enrichObservation(int numberOfPages, List<ContentStreams.Identifier> layers) {
|
||||
|
||||
if (registry == null || registry.getCurrentObservation() == null || registry.isNoop()) {
|
||||
return;
|
||||
}
|
||||
registry.getCurrentObservation().highCardinalityKeyValue("numberOfPages", String.valueOf(numberOfPages));
|
||||
for (int i = 0; i < layers.size(); i++) {
|
||||
ContentStreams.Identifier layer = layers.get(i);
|
||||
|
||||
registry.getCurrentObservation().highCardinalityKeyValue("layer_" + i, String.valueOf(layer.name()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static void wrapContentStreams(PDDocument pdDocument, PDPage pdPage) throws IOException {
|
||||
|
||||
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.PREPEND, false)) {
|
||||
contentStream.saveGraphicsState();
|
||||
}
|
||||
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, false)) {
|
||||
contentStream.restoreGraphicsState();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static Matrix getTextMatrix(PlacedText placedText, AffineTransform textDeRotationMatrix) {
|
||||
|
||||
Matrix textMatrix;
|
||||
if (placedText.textMatrix().isEmpty()) {
|
||||
textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
|
||||
(float) textDeRotationMatrix.getShearX(),
|
||||
(float) textDeRotationMatrix.getShearY(),
|
||||
(float) textDeRotationMatrix.getScaleY(),
|
||||
(float) placedText.lineStart().getX(),
|
||||
(float) placedText.lineStart().getY());
|
||||
} else {
|
||||
textMatrix = placedText.textMatrix()
|
||||
.get();
|
||||
}
|
||||
return textMatrix;
|
||||
}
|
||||
|
||||
|
||||
private static Optional<PDOptionalContentGroup> addLayerToDocument(ContentStreams.Identifier layer, PDDocument pdDocument, boolean layerVisibilityDefaultValue) {
|
||||
|
||||
if (layer.optionalContent()) {
|
||||
return Optional.of(addLayerToDocument(pdDocument, layer.name(), layerVisibilityDefaultValue));
|
||||
|
||||
}
|
||||
return Optional.empty();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, String layerName, boolean layerVisibilityDefaultValue) {
|
||||
|
||||
PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
|
||||
PDOptionalContentProperties ocprops = catalog.getOCProperties();
|
||||
if (ocprops == null) {
|
||||
ocprops = new PDOptionalContentProperties();
|
||||
catalog.setOCProperties(ocprops);
|
||||
}
|
||||
PDOptionalContentGroup layer = null;
|
||||
if (ocprops.hasGroup(layerName)) {
|
||||
layer = ocprops.getGroup(layerName);
|
||||
} else {
|
||||
layer = new PDOptionalContentGroup(layerName);
|
||||
ocprops.addGroup(layer);
|
||||
}
|
||||
ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue);
|
||||
return layer;
|
||||
}
|
||||
|
||||
|
||||
private static PDDocument openPDDocument(File tmpFile) throws IOException {
|
||||
|
||||
PDDocument pdDocument;
|
||||
pdDocument = Loader.loadPDF(tmpFile);
|
||||
pdDocument.setAllSecurityToBeRemoved(true);
|
||||
return pdDocument;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void observedIncrementalSave(PDDocument pdDocument, File outputFile) {
|
||||
|
||||
Observation.createNotStarted("ViewerDocumentService", registry).contextualName("incremental-save").observe(() -> {
|
||||
try (var out = new FileOutputStream(outputFile)) {
|
||||
pdDocument.save(out, CompressParameters.NO_COMPRESSION);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
private static void createPageResourcesIfNotPresent(PDPage pdPage) {
|
||||
|
||||
PDResources resources = pdPage.getResources();
|
||||
if (resources == null) {
|
||||
resources = new PDResources();
|
||||
pdPage.setResources(resources);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static AffineTransform getTextDeRotationTransform(PDPage page) {
|
||||
|
||||
return AffineTransform.getQuadrantRotateInstance(switch (page.getRotation()) {
|
||||
case 90 -> 3;
|
||||
case 180 -> 2;
|
||||
case 270 -> 1;
|
||||
default -> 0;
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service.pdftron;
|
||||
package com.knecon.fforesight.service.viewerdoc.service;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Line2D;
|
||||
@ -6,7 +6,10 @@ import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.EmbeddableFont;
|
||||
@ -20,6 +23,7 @@ import com.pdftron.pdf.ColorPt;
|
||||
import com.pdftron.pdf.ColorSpace;
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.Font;
|
||||
import com.pdftron.pdf.GState;
|
||||
@ -37,8 +41,9 @@ public class VisualizationWriter {
|
||||
|
||||
ElementWriter writer;
|
||||
ElementBuilder builder;
|
||||
List<Visualizations> visualizations;
|
||||
Map<ContentStreams.Identifier, Group> groupMap;
|
||||
ElementReader reader;
|
||||
List<LayerGroup> layerGroups;
|
||||
Map<LayerIdentifier, Group> groupMap;
|
||||
Map<EmbeddableFont, Font> fontMap;
|
||||
|
||||
|
||||
@ -48,27 +53,68 @@ public class VisualizationWriter {
|
||||
begin(page);
|
||||
|
||||
AffineTransform textDeRotationMatrix = getTextDeRotationTransform(page);
|
||||
AffineTransform pageTransformation = getTextDeRotationTransform(page);
|
||||
|
||||
for (Visualizations visualization : visualizations) {
|
||||
for (LayerGroup layerGroup : layerGroups) {
|
||||
|
||||
VisualizationsOnPage visualizationsOnPage = visualization.getVisualizationsOnPages()
|
||||
.get(pageNumber);
|
||||
Element markedContentStart = builder.createMarkedContentBeginInlineProperties(layerGroup.getGroupIdentifier().markedContentName());
|
||||
writer.writeElement(markedContentStart);
|
||||
|
||||
if (layerGroup.isOptionalContent()) {
|
||||
Element ocgStart = builder.createMarkedContentBegin("OC", groupMap.get(layerGroup.getGroupIdentifier()).getSDFObj());
|
||||
writer.writeElement(ocgStart);
|
||||
}
|
||||
|
||||
Element escape = builder.createGroupBegin();
|
||||
writer.writeElement(escape);
|
||||
|
||||
writeVisualizations(pageNumber, layerGroup, textDeRotationMatrix);
|
||||
|
||||
Element escapeEnd = builder.createGroupEnd();
|
||||
writer.writeElement(escapeEnd);
|
||||
|
||||
if (layerGroup.isOptionalContent()) {
|
||||
Element ocgEnd2 = builder.createMarkedContentEnd();
|
||||
writer.writeElement(ocgEnd2);
|
||||
}
|
||||
|
||||
Element markedContentEnd = builder.createMarkedContentEnd();
|
||||
writer.writeElement(markedContentEnd);
|
||||
}
|
||||
|
||||
end();
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void writeVisualizations(int pageNumber, LayerGroup layerGroup, AffineTransform textDeRotationMatrix) throws PDFNetException {
|
||||
|
||||
for (Visualizations visualization : layerGroup.getVisualizations()) {
|
||||
|
||||
VisualizationsOnPage visualizationsOnPage = visualization.getVisualizationsOnPages().get(pageNumber);
|
||||
|
||||
if (visualizationsOnPage == null || visualizationsOnPage.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Element markedContentStart = builder.createMarkedContentBeginInlineProperties(visualization.getLayer().cosName().getName());
|
||||
Element markedContentStart = builder.createMarkedContentBeginInlineProperties(visualization.getLayer().markedContentName());
|
||||
writer.writeElement(markedContentStart);
|
||||
|
||||
if (visualization.getLayer().optionalContent()) {
|
||||
if (layerGroup.subLayersAreOptionalContent()) {
|
||||
Element ocgStart = builder.createMarkedContentBegin("OC", groupMap.get(visualization.getLayer()).getSDFObj());
|
||||
writer.writeElement(ocgStart);
|
||||
}
|
||||
|
||||
Element escape = builder.createGroupBegin();
|
||||
writer.writeElement(escape);
|
||||
|
||||
|
||||
writeVisualization(visualizationsOnPage, textDeRotationMatrix);
|
||||
|
||||
if (visualization.getLayer().optionalContent()) {
|
||||
Element escapeEnd = builder.createGroupEnd();
|
||||
writer.writeElement(escapeEnd);
|
||||
|
||||
if (layerGroup.subLayersAreOptionalContent()) {
|
||||
Element ocgEnd = builder.createMarkedContentEnd();
|
||||
writer.writeElement(ocgEnd);
|
||||
}
|
||||
@ -77,9 +123,6 @@ public class VisualizationWriter {
|
||||
writer.writeElement(markedContentEnd);
|
||||
|
||||
}
|
||||
|
||||
end();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -123,6 +166,7 @@ public class VisualizationWriter {
|
||||
|
||||
writePlacedText(textDeRotationMatrix, placedText);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -130,26 +174,46 @@ public class VisualizationWriter {
|
||||
|
||||
float[] rgbComponents = placedText.color().getRGBColorComponents(null);
|
||||
Font font = fontMap.get(placedText.font());
|
||||
|
||||
Element text = builder.createTextRun(placedText.text(), font, placedText.fontSize());
|
||||
|
||||
if (placedText.renderingMode()
|
||||
.isPresent()) {
|
||||
text.getGState()
|
||||
.setRenderingIntent(placedText.renderingMode()
|
||||
.get().intValue());
|
||||
} else {
|
||||
try (ColorPt color = new ColorPt(rgbComponents[0], rgbComponents[1], rgbComponents[2])) {
|
||||
text.getGState().setFillColor(color);
|
||||
}
|
||||
text.getGState().setRenderingIntent(GState.e_fill_text);
|
||||
Element text = builder.createTextBegin(font, placedText.fontSize());
|
||||
text.getGState().setFillColorSpace(ColorSpace.createDeviceRGB());
|
||||
try (ColorPt color = new ColorPt(rgbComponents[0], rgbComponents[1], rgbComponents[2])) {
|
||||
text.getGState().setFillColor(color);
|
||||
}
|
||||
|
||||
try (Matrix2D textMatrix = getTextMatrix(placedText, textDeRotationMatrix)) {
|
||||
text.setTextMatrix(textMatrix);
|
||||
}
|
||||
|
||||
text.getGState()
|
||||
.setTextRenderMode(placedText.renderingMode()
|
||||
.map(VisualizationWriter::resolveTextRenderMode).orElse(GState.e_fill_text));
|
||||
writer.writeElement(text);
|
||||
|
||||
text = switch (font.getType()) {
|
||||
case Font.e_Type0, Font.e_CIDType0, Font.e_TrueType, Font.e_CIDType2 -> builder.createUnicodeTextRun(placedText.text());
|
||||
case Font.e_Type1 -> builder.createTextRun(placedText.text());
|
||||
default -> throw new IllegalStateException("Unexpected value: " + font.getType());
|
||||
};
|
||||
|
||||
writer.writeElement(text);
|
||||
text = builder.createTextEnd();
|
||||
writer.writeElement(text);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static int resolveTextRenderMode(RenderingMode renderingMode) {
|
||||
|
||||
return switch (renderingMode) {
|
||||
case FILL -> GState.e_fill_text;
|
||||
case STROKE -> GState.e_stroke_text;
|
||||
case FILL_STROKE -> GState.e_fill_stroke_text;
|
||||
case NEITHER -> GState.e_invisible_text;
|
||||
case FILL_CLIP -> GState.e_fill_clip_text;
|
||||
case STROKE_CLIP -> GState.e_stroke_clip_text;
|
||||
case FILL_STROKE_CLIP -> GState.e_fill_stroke_clip_text;
|
||||
case NEITHER_CLIP -> GState.e_clip_text;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@ -220,21 +284,32 @@ public class VisualizationWriter {
|
||||
|
||||
Matrix2D textMatrix;
|
||||
if (placedText.textMatrix().isEmpty()) {
|
||||
textMatrix = new Matrix2D(textDeRotationMatrix.getScaleX(),
|
||||
textDeRotationMatrix.getShearX(),
|
||||
textDeRotationMatrix.getShearY(),
|
||||
textDeRotationMatrix.getScaleY(),
|
||||
placedText.lineStart().getX(),
|
||||
placedText.lineStart().getY());
|
||||
textMatrix = toMatrix2D(textDeRotationMatrix, placedText.lineStart().getX(), placedText.lineStart().getY());
|
||||
} else {
|
||||
var matrix = placedText.textMatrix()
|
||||
.get();
|
||||
textMatrix = new Matrix2D(matrix.getScaleX(), matrix.getShearX(), matrix.getShearY(), matrix.getScaleY(), matrix.getTranslateX(), matrix.getTranslateY());
|
||||
var matrix = placedText.textMatrix().get();
|
||||
textMatrix = toMatrix2D(matrix);
|
||||
}
|
||||
return textMatrix;
|
||||
}
|
||||
|
||||
|
||||
private static Matrix2D toMatrix2D(AffineTransform matrix) throws PDFNetException {
|
||||
|
||||
return new Matrix2D(matrix.getScaleX(), matrix.getShearY(), matrix.getShearX(), matrix.getScaleY(), matrix.getTranslateX(), matrix.getTranslateY());
|
||||
}
|
||||
|
||||
|
||||
private static Matrix2D toMatrix2D(AffineTransform textDeRotationMatrix, double translateX, double translateY) throws PDFNetException {
|
||||
|
||||
return new Matrix2D(textDeRotationMatrix.getScaleX(),
|
||||
textDeRotationMatrix.getShearY(),
|
||||
textDeRotationMatrix.getShearX(),
|
||||
textDeRotationMatrix.getScaleY(),
|
||||
translateX,
|
||||
translateY);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static AffineTransform getTextDeRotationTransform(Page page) {
|
||||
|
||||
@ -1,120 +0,0 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service.pdftron;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import com.pdftron.pdf.Element;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.Page;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Builder
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class PageContentCleaner {
|
||||
|
||||
ElementWriter writer;
|
||||
ElementReader reader;
|
||||
ElementBuilder elementBuilder;
|
||||
Set<String> markedContentToDraw;
|
||||
Set<String> kneconMarkedContents;
|
||||
|
||||
@Builder.Default
|
||||
MarkedContentStack markedContentStack = new MarkedContentStack();
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void cleanPage(Page page) {
|
||||
|
||||
begin(page);
|
||||
boolean escaped = reader.next().getType() == Element.e_group_begin;
|
||||
|
||||
if (!escaped) {
|
||||
writer.writeElement(elementBuilder.createGroupBegin());
|
||||
}
|
||||
|
||||
copyElementsUntilFirstKneconMarkedContent();
|
||||
|
||||
if (!escaped) {
|
||||
writer.writeElement(elementBuilder.createGroupEnd());
|
||||
}
|
||||
|
||||
copyElementsExceptMarkedContentToDraw();
|
||||
end();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void begin(Page page) {
|
||||
|
||||
writer.begin(page, ElementWriter.e_replacement, false, true, page.getResourceDict());
|
||||
reader.begin(page);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void end() {
|
||||
|
||||
writer.end();
|
||||
reader.end();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void copyElementsUntilFirstKneconMarkedContent() {
|
||||
|
||||
for (Element element = reader.current(); element != null; element = reader.next()) {
|
||||
|
||||
switch (element.getType()) {
|
||||
case Element.e_marked_content_begin -> {
|
||||
markedContentStack.enterMarkedContent(element.getMCTag().getName());
|
||||
if (markedContentStack.currentMarkedContentContainsAny(kneconMarkedContents)) {
|
||||
break;
|
||||
}
|
||||
writer.writeElement(element);
|
||||
}
|
||||
case Element.e_marked_content_end -> {
|
||||
markedContentStack.leaveMarkedContent();
|
||||
writer.writeElement(element);
|
||||
}
|
||||
default -> writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void copyElementsExceptMarkedContentToDraw() {
|
||||
|
||||
for (Element element = reader.current(); element != null; element = reader.next()) {
|
||||
|
||||
switch (element.getType()) {
|
||||
case Element.e_marked_content_begin -> {
|
||||
markedContentStack.enterMarkedContent(element.getMCTag().getName());
|
||||
if (!markedContentStack.currentMarkedContentContainsAny(markedContentToDraw)) {
|
||||
writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
case Element.e_marked_content_end -> {
|
||||
if (!markedContentStack.currentMarkedContentContainsAny(markedContentToDraw)) {
|
||||
writer.writeElement(element);
|
||||
}
|
||||
markedContentStack.leaveMarkedContent();
|
||||
}
|
||||
default -> {
|
||||
if (!markedContentStack.currentMarkedContentContainsAny(markedContentToDraw)) {
|
||||
writer.writeElement(element);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,96 +0,0 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service.pdftron;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
import com.pdftron.common.PDFNetException;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.ocg.Config;
|
||||
import com.pdftron.pdf.ocg.Group;
|
||||
import com.pdftron.sdf.Obj;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class PdftronLayerUtility {
|
||||
|
||||
public Map<ContentStreams.Identifier, Group> addLayersToDocument(List<Visualizations> visualizations, PDFDoc pdfDoc) {
|
||||
|
||||
Map<ContentStreams.Identifier, Group> optionalContentGroupMap = new HashMap<>();
|
||||
for (Visualizations visualization : visualizations) {
|
||||
addLayerToDocument(visualization.getLayer(), pdfDoc, visualization.isLayerVisibilityDefaultValue())//
|
||||
.ifPresent(ocg -> optionalContentGroupMap.put(visualization.getLayer(), ocg));
|
||||
}
|
||||
return optionalContentGroupMap;
|
||||
}
|
||||
|
||||
|
||||
private Optional<Group> addLayerToDocument(ContentStreams.Identifier layer, PDFDoc pdfDoc, boolean layerVisibilityDefaultValue) {
|
||||
|
||||
if (layer.optionalContent()) {
|
||||
return Optional.of(addLayerToDocument(pdfDoc, layer.name(), layerVisibilityDefaultValue));
|
||||
|
||||
}
|
||||
return Optional.empty();
|
||||
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Group addLayerToDocument(PDFDoc doc, String layerName, boolean layerVisibilityDefaultValue) {
|
||||
|
||||
Optional<Group> existingGroup = findGroupInDoc(doc, layerName);
|
||||
|
||||
if (existingGroup.isPresent()) {
|
||||
return existingGroup.get();
|
||||
}
|
||||
|
||||
return addNewLayer(doc, layerName, layerVisibilityDefaultValue);
|
||||
}
|
||||
|
||||
|
||||
private Group addNewLayer(PDFDoc doc, String layerName, boolean layerVisibilityDefaultValue) throws PDFNetException {
|
||||
|
||||
Config cfg = doc.getOCGConfig();
|
||||
if (cfg == null) {
|
||||
cfg = Config.create(doc, true);
|
||||
cfg.setName("Default");
|
||||
}
|
||||
Group grp = Group.create(doc, layerName);
|
||||
grp.setInitialState(cfg, layerVisibilityDefaultValue);
|
||||
|
||||
// Add the new OCG to the list of layers that should appear in PDF viewer GUI.
|
||||
Obj layerOrderArray = cfg.getOrder();
|
||||
if (layerOrderArray == null) {
|
||||
layerOrderArray = doc.createIndirectArray();
|
||||
cfg.setOrder(layerOrderArray);
|
||||
}
|
||||
layerOrderArray.pushBack(grp.getSDFObj());
|
||||
|
||||
return grp;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private Optional<Group> findGroupInDoc(PDFDoc doc, String layerName) {
|
||||
|
||||
Obj ocgs = doc.getOCGs();
|
||||
if (ocgs != null) {
|
||||
int i;
|
||||
int sz = (int) ocgs.size();
|
||||
for (i = 0; i < sz; ++i) {
|
||||
Group ocg = new Group(ocgs.getAt(i));
|
||||
if (ocg.getName().equals(layerName)) {
|
||||
return Optional.of(ocg);
|
||||
}
|
||||
}
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,124 +0,0 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Files;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||
import com.knecon.fforesight.service.viewerdoc.pdf.ClassifiedContentStream;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class ContentStreamClassifierTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testClassification() {
|
||||
|
||||
File pdfFile = new File(Thread.currentThread().getContextClassLoader().getResource("viewerDocLayers.pdf").getFile());
|
||||
|
||||
try (PDDocument document = Loader.loadPDF(pdfFile)) {
|
||||
|
||||
PDPage page = document.getPage(0);
|
||||
|
||||
List<ClassifiedContentStream> classifieds = ContentStreamClassifier.getClassifiedContentStreams(page);
|
||||
|
||||
logContentStreamClassifications(classifieds);
|
||||
|
||||
assertEquals(11, classifieds.size());
|
||||
assertEquals(ContentStreams.ESCAPE_START, classifieds.get(0).classification());
|
||||
for (int i = 1; i < 9; i++) {
|
||||
assertEquals(ContentStreams.OTHER, classifieds.get(i).classification());
|
||||
}
|
||||
assertEquals(ContentStreams.ESCAPE_END, classifieds.get(9).classification());
|
||||
assertEquals(ContentStreams.KNECON_LAYOUT, classifieds.get(10).classification());
|
||||
assertTrue(ContentStreamClassifier.areAllContentStreamsEscaped(classifieds));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testRemoveLayoutLayer() {
|
||||
|
||||
File pdfFile = new File(Thread.currentThread().getContextClassLoader().getResource("viewerDocLayers.pdf").getFile());
|
||||
File tmpFile = Files.createTempFile("removedLayout", ".pdf").toFile();
|
||||
|
||||
try (PDDocument document = Loader.loadPDF(pdfFile)) {
|
||||
|
||||
PDPage page = document.getPage(0);
|
||||
|
||||
List<ClassifiedContentStream> classifieds = ContentStreamClassifier.getClassifiedContentStreams(page);
|
||||
page.setContents(ContentStreamUtility.removeLayerFromContentStreams(Set.of(ContentStreams.KNECON_LAYOUT), classifieds));
|
||||
|
||||
document.save(tmpFile);
|
||||
}
|
||||
try (PDDocument document2 = Loader.loadPDF(tmpFile)) {
|
||||
|
||||
PDPage page2 = document2.getPage(0);
|
||||
|
||||
List<ClassifiedContentStream> classifieds2 = ContentStreamClassifier.getClassifiedContentStreams(page2);
|
||||
|
||||
logContentStreamClassifications(classifieds2);
|
||||
|
||||
assertEquals(10, classifieds2.size());
|
||||
assertEquals(ContentStreams.ESCAPE_START, classifieds2.get(0).classification());
|
||||
for (int i = 1; i < 9; i++) {
|
||||
assertEquals(ContentStreams.OTHER, classifieds2.get(i).classification());
|
||||
}
|
||||
assertEquals(ContentStreams.ESCAPE_END, classifieds2.get(9).classification());
|
||||
assertTrue(ContentStreamClassifier.areAllContentStreamsEscaped(classifieds2));
|
||||
}
|
||||
assert tmpFile.delete();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testClassificationForOldLayers() {
|
||||
|
||||
File pdfFile = new File(Thread.currentThread().getContextClassLoader().getResource("oldViewerDocLayers.pdf").getFile());
|
||||
try (PDDocument document = Loader.loadPDF(pdfFile)) {
|
||||
|
||||
PDPage page = document.getPage(0);
|
||||
|
||||
List<ClassifiedContentStream> classifieds = ContentStreamClassifier.getClassifiedContentStreams(page);
|
||||
|
||||
logContentStreamClassifications(classifieds);
|
||||
|
||||
assertEquals(11, classifieds.size());
|
||||
assertEquals(ContentStreams.ESCAPE_START, classifieds.get(0).classification());
|
||||
for (int i = 1; i < 9; i++) {
|
||||
assertEquals(ContentStreams.OTHER, classifieds.get(i).classification());
|
||||
}
|
||||
assertEquals(ContentStreams.ESCAPE_END, classifieds.get(9).classification());
|
||||
assertEquals(ContentStreams.KNECON_LAYOUT, classifieds.get(10).classification());
|
||||
assertTrue(ContentStreamClassifier.areAllContentStreamsEscaped(classifieds));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static void logContentStreamClassifications(List<ClassifiedContentStream> classifieds) {
|
||||
|
||||
log.info("number of content streams: {}", classifieds.size());
|
||||
log.info("Classifications: {}", classifieds.stream()//
|
||||
.map(ClassifiedContentStream::classification)//
|
||||
.map(ContentStreams.Identifier::cosName)//
|
||||
.map(COSName::getName)//
|
||||
.collect(Collectors.joining(", ")));
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,76 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Set;
|
||||
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.viewerdoc.LayerIdentifier;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.pdf.PageIterator;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Disabled
|
||||
class PageContentCleanerTest {
|
||||
|
||||
@BeforeEach
|
||||
public void init() {
|
||||
|
||||
PDFNet.initialize("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a");
|
||||
}
|
||||
|
||||
|
||||
@AfterAll
|
||||
public static void cleanup() {
|
||||
|
||||
PDFNet.terminate();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testContentCleaning() {
|
||||
|
||||
Path file = Path.of("/tmp/OCR_TEST/402Study.pdf/viewerDocument.pdf");
|
||||
File tmpFile = new File("/tmp/cleaned.pdf");
|
||||
try (var in = new FileInputStream(file.toFile());//
|
||||
var doc = new PDFDoc(in);//
|
||||
var out = new FileOutputStream(tmpFile);//
|
||||
ElementWriter pageWriter = new ElementWriter();//
|
||||
ElementReader reader = new ElementReader();//
|
||||
ElementBuilder builder = new ElementBuilder()//
|
||||
) {
|
||||
|
||||
PageContentCleaner pageContentCleaner = PageContentCleaner.builder()
|
||||
.writer(pageWriter)
|
||||
.reader(reader)
|
||||
.elementBuilder(builder)
|
||||
.markedContentToRemove(Set.of(LayerIdentifier.KNECON_OCR_DEBUG.markedContentName()))
|
||||
.build();
|
||||
|
||||
for (PageIterator iterator = doc.getPageIterator(); iterator.hasNext(); ) {
|
||||
|
||||
Page page = iterator.next();
|
||||
|
||||
pageContentCleaner.removeMarkedContent(page);
|
||||
}
|
||||
|
||||
doc.save(out, SDFDoc.SaveMode.REMOVE_UNUSED, null);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,58 @@
|
||||
package com.knecon.fforesight.service.viewerdoc.service;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import javax.swing.table.AbstractTableModel;
|
||||
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.pdftron.common.Matrix2D;
|
||||
import com.pdftron.pdf.ElementBuilder;
|
||||
import com.pdftron.pdf.ElementReader;
|
||||
import com.pdftron.pdf.ElementWriter;
|
||||
import com.pdftron.pdf.PDFDoc;
|
||||
import com.pdftron.pdf.PDFNet;
|
||||
import com.pdftron.pdf.Page;
|
||||
import com.pdftron.sdf.SDFDoc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class ViewerDocVersioningUtilityTest {
|
||||
|
||||
@BeforeEach
|
||||
public void init() {
|
||||
|
||||
PDFNet.initialize("demo:1650351709282:7bd235e003000000004ec28a6743e1163a085e2115de2536ab6e2cfe5a");
|
||||
}
|
||||
|
||||
|
||||
@AfterAll
|
||||
public static void cleanup() {
|
||||
|
||||
PDFNet.terminate();
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testMarking() {
|
||||
|
||||
File file = new ClassPathResource("files/empty.pdf").getFile();
|
||||
Path tmpFile = Files.createTempFile("markedDocument", ".pdf");
|
||||
try (var in = new FileInputStream(file); var doc = new PDFDoc(in); var out = new FileOutputStream(tmpFile.toFile())) {
|
||||
ViewerDocVersioningUtility.setVersionInDocument(doc);
|
||||
doc.save(out, SDFDoc.SaveMode.LINEARIZED, null);
|
||||
}
|
||||
assert ViewerDocVersioningUtility.isCurrentVersion(tmpFile.toFile());
|
||||
assert tmpFile.toFile().delete();
|
||||
}
|
||||
|
||||
}
|
||||
BIN
layoutparser-service/viewer-doc-processor/src/test/resources/files/empty.pdf
(Stored with Git LFS)
Normal file
BIN
layoutparser-service/viewer-doc-processor/src/test/resources/files/empty.pdf
(Stored with Git LFS)
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user