diff --git a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java index f39572f..aad4610 100644 --- a/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java +++ b/layoutparser-service/layoutparser-service-internal-api/src/main/java/com/knecon/fforesight/service/layoutparser/internal/api/queue/LayoutParsingRequest.java @@ -10,36 +10,25 @@ import lombok.NonNull; @Builder @Schema(description = "Object containing all storage paths the service needs to know.") public record LayoutParsingRequest( - @Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}")// @NonNull LayoutParsingType layoutParsingType, - @Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.")// Map identifier, - @Schema(description = "Path to the original PDF file.")// - @NonNull String originFileStorageId,// + @NonNull String originFileStorageId, + Optional tablesFileStorageId, + Optional imagesFileStorageId, - @Schema(description = "Optional Path to the table extraction file.")// - Optional tablesFileStorageId,// - @Schema(description = "Optional Path to the image classification file.")// - Optional imagesFileStorageId,// + Optional visualLayoutParsingFileId, - @Schema(description = "Optional Path to the the visual layout parsing service file") Optional visualLayoutParsingFileId,// - - @Schema(description = "Path where the Document Structure File will be stored.")// - @NonNull String structureFileStorageId,// - @Schema(description = "Path where the Research Data File will be stored.")// - String researchDocumentStorageId,// - @Schema(description = "Path where the Document Text File will be stored.")// - @NonNull String textBlockFileStorageId,// - @Schema(description = "Path where the Document Positions File will be stored.")// - @NonNull String positionBlockFileStorageId,// - @Schema(description = "Path where the Document Pages File will be stored.")// - @NonNull String pageFileStorageId,// - @Schema(description = "Path where the Simplified Text File will be stored.")// - @NonNull String simplifiedTextStorageId,// - @Schema(description = "Path where the Viewer Document PDF will be stored.")// - @NonNull String viewerDocumentStorageId) { + @NonNull String structureFileStorageId, + String researchDocumentStorageId, + String markdownDocumentStorageId, + @NonNull String textBlockFileStorageId, + @NonNull String positionBlockFileStorageId, + @NonNull String pageFileStorageId, + @NonNull String simplifiedTextStorageId, + @NonNull String viewerDocumentStorageId +) { } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 92d15ba..f264eb8 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -4,6 +4,7 @@ import static java.lang.String.format; import java.awt.geom.Point2D; import java.awt.geom.Rectangle2D; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.nio.file.Files; @@ -18,12 +19,15 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent; +import org.commonmark.ext.gfm.tables.TablesExtension; +import org.commonmark.renderer.markdown.MarkdownRenderer; import org.springframework.stereotype.Service; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; +import com.knecon.fforesight.service.layoutparser.processor.markdown.DocumentDataParser; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; @@ -120,24 +124,18 @@ public class LayoutParsingPipeline { File viewerDocumentFile = originFile; VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse(); - if (layoutParsingRequest.visualLayoutParsingFileId() - .isPresent()) { - visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId() - .get()); + if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) { + visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get()); } ImageServiceResponse imageServiceResponse = new ImageServiceResponse(); - if (layoutParsingRequest.imagesFileStorageId() - .isPresent()) { - imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId() - .get()); + if (layoutParsingRequest.imagesFileStorageId().isPresent()) { + imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()); } TableServiceResponse tableServiceResponse = new TableServiceResponse(); - if (layoutParsingRequest.tablesFileStorageId() - .isPresent()) { - tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId() - .get()); + if (layoutParsingRequest.tablesFileStorageId().isPresent()) { + tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); } ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null // @@ -155,12 +153,7 @@ public class LayoutParsingPipeline { log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); - layoutGridService.addLayoutGrid(viewerDocumentFile, - documentGraph, - viewerDocumentFile, - false, - layoutParsingRequest.visualLayoutParsingFileId() - .isPresent()); + layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent()); log.info("Storing resulting files for {}", layoutParsingRequest.identifier()); @@ -174,6 +167,16 @@ public class LayoutParsingPipeline { layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData); } + if (layoutParsingRequest.markdownDocumentStorageId() != null) { + log.info("Rendering document data as markdown for {}", layoutParsingRequest.identifier()); + var markdownDocument = DocumentDataParser.parse(documentGraph.streamAllSubNodes()); + MarkdownRenderer renderer = MarkdownRenderer.builder().extensions(List.of(TablesExtension.create())).build(); + String markdown = renderer.render(markdownDocument); + try (var in = new ByteArrayInputStream(markdown.getBytes())) { + layoutParsingStorageService.storeObject(layoutParsingRequest.markdownDocumentStorageId(), in); + } + } + if (!viewerDocumentFile.equals(originFile)) { viewerDocumentFile.delete(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java index 471db6a..bcd23cf 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingStorageService.java @@ -102,6 +102,11 @@ public class LayoutParsingStorageService { storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData); } + public void storeObject(String storageId, InputStream in) { + + storageService.storeObject(TenantContext.getTenantId(), storageId, in); + } + private File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/DocumentDataParser.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/DocumentDataParser.java index e06f371..3494b3d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/DocumentDataParser.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/markdown/DocumentDataParser.java @@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.markdown; import java.util.ArrayList; import java.util.HashSet; +import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Objects; @@ -16,8 +17,10 @@ import org.commonmark.ext.gfm.tables.TableHead; import org.commonmark.ext.gfm.tables.TableRow; import org.commonmark.node.Document; import org.commonmark.node.Emphasis; +import org.commonmark.node.HardLineBreak; import org.commonmark.node.Heading; import org.commonmark.node.Node; +import org.commonmark.node.SoftLineBreak; import org.commonmark.node.StrongEmphasis; import org.commonmark.node.Text; @@ -88,14 +91,7 @@ public class DocumentDataParser { private Node createTableCell(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell tc) { var cell = new TableCell(); - if (tc.isLeaf()) { - parseTextBlock(tc.getTextBlock()).forEach(cell::appendChild); - } else { - tc.streamChildren() - .map(DocumentDataParser::parseNode) - .filter(Objects::nonNull) - .forEach(cell::appendChild); - } + parseTextBlock(tc.getTextBlock()).forEach(cell::appendChild); return cell; } @@ -103,7 +99,7 @@ public class DocumentDataParser { private org.commonmark.node.Paragraph parseParagraph(Paragraph paragraph) { org.commonmark.node.Paragraph heading = new org.commonmark.node.Paragraph(); - parseTextBlock(paragraph.getTextBlock()).forEach(heading::appendChild); + parseTextBlockWithLineBreaks(paragraph.getTextBlock()).forEach(heading::appendChild); return heading; } @@ -112,11 +108,56 @@ public class DocumentDataParser { Heading heading = new Heading(); heading.setLevel(headline.getTreeId().size()); - parseTextBlock(headline.getTextBlock()).forEach(heading::appendChild); + parseTextBlockWithLineBreaks(headline.getTextBlock()).forEach(heading::appendChild); return heading; } + private List parseTextBlockWithLineBreaks(TextBlock textBlock) { + + LinkedList result = new LinkedList<>(); + List textRanges = mergeTextStyles(textBlock); + for (TextRangeWithTextType textRange : textRanges) { + if (textBlock.subSequenceWithLineBreaks(textRange.textRange()).equals("\n")) { + result.add(new HardLineBreak()); + } + String text = textBlock.subSequenceWithLineBreaks(textRange.textRange()); + String[] lines = text.split("\n"); + for (String line : lines) { + String cleanedLine = line.trim(); + if (cleanedLine.isEmpty()) { + result.add(new HardLineBreak()); + continue; + } + switch (textRange.fontStyle()) { + case REGULAR -> result.add(new Text(cleanedLine)); + case BOLD -> { + StrongEmphasis boldBlock = new StrongEmphasis(); + boldBlock.appendChild(new Text(cleanedLine)); + result.add(boldBlock); + } + case ITALIC -> { + Emphasis italicBlock = new Emphasis("_"); + italicBlock.appendChild(new Text(cleanedLine)); + result.add(italicBlock); + } + case BOLD_ITALIC -> { + Emphasis italicBlock = new Emphasis("_"); + + StrongEmphasis boldBlock = new StrongEmphasis(); + boldBlock.appendChild(new Text(cleanedLine)); + + italicBlock.appendChild(boldBlock); + result.add(italicBlock); + } + } + result.add(new HardLineBreak()); + } + } + result.removeLast(); + return result; + } + private List parseTextBlock(TextBlock textBlock) { @@ -124,22 +165,22 @@ public class DocumentDataParser { List textRanges = mergeTextStyles(textBlock); for (TextRangeWithTextType textRange : textRanges) { switch (textRange.fontStyle()) { - case REGULAR -> result.add(new Text(textBlock.subSequenceWithLineBreaks(textRange.textRange()))); + case REGULAR -> result.add(new Text(textBlock.subSequence(textRange.textRange()).toString())); case BOLD -> { StrongEmphasis boldBlock = new StrongEmphasis(); - boldBlock.appendChild(new Text(textBlock.subSequenceWithLineBreaks(textRange.textRange()))); + boldBlock.appendChild(new Text(textBlock.subSequence(textRange.textRange()).toString())); result.add(boldBlock); } case ITALIC -> { - Emphasis italicBlock = new Emphasis(); - italicBlock.appendChild(new Text(textBlock.subSequenceWithLineBreaks(textRange.textRange()))); + Emphasis italicBlock = new Emphasis("_"); + italicBlock.appendChild(new Text(textBlock.subSequence(textRange.textRange()).toString())); result.add(italicBlock); } case BOLD_ITALIC -> { - Emphasis italicBlock = new Emphasis(); + Emphasis italicBlock = new Emphasis("_"); StrongEmphasis boldBlock = new StrongEmphasis(); - boldBlock.appendChild(new Text(textBlock.subSequenceWithLineBreaks(textRange.textRange()))); + boldBlock.appendChild(new Text(textBlock.subSequence(textRange.textRange()).toString())); italicBlock.appendChild(boldBlock); result.add(italicBlock); @@ -154,38 +195,49 @@ public class DocumentDataParser { List result = new ArrayList<>(); - TreeMap> styleChanges = new TreeMap<>(); + TreeMap> styleChanges = new TreeMap<>(); + + int start = textBlock.getTextRange().start(); + int end = textBlock.getTextRange().end(); for (TextRange bold : textBlock.getBoldTextBoundaries()) { - styleChanges.computeIfAbsent(bold.start(), k -> new HashSet<>()).add(FontStyle.BOLD); - styleChanges.computeIfAbsent(bold.end(), k -> new HashSet<>()).add(FontStyle.REGULAR); + styleChanges.computeIfAbsent(bold.start() + start, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.BOLD)); + styleChanges.computeIfAbsent(bold.end() + start, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.BOLD)); } for (TextRange italic : textBlock.getItalicTextBoundaries()) { - styleChanges.computeIfAbsent(italic.start(), k -> new HashSet<>()).add(FontStyle.ITALIC); - styleChanges.computeIfAbsent(italic.end(), k -> new HashSet<>()).add(FontStyle.REGULAR); + styleChanges.computeIfAbsent(italic.start() + start, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.ITALIC)); + styleChanges.computeIfAbsent(italic.end() + start, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.ITALIC)); } if (styleChanges.isEmpty()) { - result.add(new TextRangeWithTextType(new TextRange(0, textBlock.length()), FontStyle.REGULAR)); + result.add(new TextRangeWithTextType(new TextRange(start, end), FontStyle.REGULAR)); return result; } - int start = 0; Set currentStyles = new HashSet<>(); currentStyles.add(FontStyle.REGULAR); - for (Map.Entry> entry : styleChanges.entrySet()) { + for (Map.Entry> entry : styleChanges.entrySet()) { int point = entry.getKey(); - Set changes = entry.getValue(); + Set changes = entry.getValue(); if (point > start) { FontStyle style = determineFontStyle(currentStyles); result.add(new TextRangeWithTextType(new TextRange(start, point), style)); } - currentStyles.removeAll(changes); - currentStyles.addAll(changes); + changes.stream() + .filter(FontStyleChange::leave) + .map(FontStyleChange::style) + .toList() + .forEach(currentStyles::remove); + + currentStyles.addAll(changes.stream() + .filter(FontStyleChange::enter) + .map(FontStyleChange::style) + .toList()); + if (currentStyles.isEmpty()) { currentStyles.add(FontStyle.REGULAR); } @@ -193,12 +245,14 @@ public class DocumentDataParser { start = point; } - if (start < textBlock.length()) { + if (start < end) { FontStyle style = determineFontStyle(currentStyles); - result.add(new TextRangeWithTextType(new TextRange(start, textBlock.length()), style)); + result.add(new TextRangeWithTextType(new TextRange(start, textBlock.getTextRange().end()), style)); } - return result; + return result.stream() + .filter(t -> t.textRange.length() > 1) + .toList(); } @@ -223,6 +277,27 @@ public class DocumentDataParser { BOLD_ITALIC; } + record FontStyleChange(boolean enter, FontStyle style) { + + public static FontStyleChange enter(FontStyle style) { + + return new FontStyleChange(true, style); + } + + + public static FontStyleChange leave(FontStyle style) { + + return new FontStyleChange(false, style); + } + + + public boolean leave() { + + return !enter; + } + + } + record TextRangeWithTextType(TextRange textRange, FontStyle fontStyle) { } diff --git a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java index b10c15e..1473407 100644 --- a/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java +++ b/layoutparser-service/layoutparser-service-server/src/main/java/com/knecon/fforesight/service/layoutparser/server/queue/MessageHandler.java @@ -35,7 +35,6 @@ public class MessageHandler { private final ObjectMapper objectMapper; private final RabbitTemplate rabbitTemplate; private final static String X_PIPELINE_PREFIX = "X-PIPE-"; - private final LogFileWebEndpoint logFileWebEndpoint; @RabbitHandler diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java index 75e62c9..e0fa9a4 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/LayoutparserEnd2EndTest.java @@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest { @Test public void testLayoutParserEndToEnd() { - String filePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf"; + String filePath = "files/new/crafted document.pdf"; runForFile(filePath); } @@ -79,9 +79,11 @@ public class LayoutparserEnd2EndTest extends AbstractTest { .forEach(log::info); File tmpFile = new File("/tmp/layoutparserEND2END/" + fileName + "_VIEWER.pdf"); + File markdownTmpFile = new File("/tmp/layoutparserEND2END/" + fileName + ".md"); assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs(); storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile); + storageService.downloadTo(TENANT_ID, layoutParsingRequest.markdownDocumentStorageId(), markdownTmpFile); } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java index cbd7b6d..bf67465 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/utils/AbstractTest.java @@ -41,6 +41,7 @@ import lombok.SneakyThrows; @Import(AbstractTest.TestConfiguration.class) public abstract class AbstractTest { + public static final String MARKDOWN_FILE_ID = "markdown"; @Autowired protected LayoutParsingStorageService layoutParsingStorageService; @@ -121,6 +122,7 @@ public abstract class AbstractTest { .pageFileStorageId(fileName + PAGES_FILE_ID) .simplifiedTextStorageId(fileName + SIMPLIFIED_ID) .viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID) + .markdownDocumentStorageId(fileName + MARKDOWN_FILE_ID) .build(); } diff --git a/layoutparser-service/viewer-doc-processor/src/test/resources/oldViewerDocLayers.pdf b/layoutparser-service/viewer-doc-processor/src/test/resources/oldViewerDocLayers.pdf index 9b3f010..967a529 100644 Binary files a/layoutparser-service/viewer-doc-processor/src/test/resources/oldViewerDocLayers.pdf and b/layoutparser-service/viewer-doc-processor/src/test/resources/oldViewerDocLayers.pdf differ diff --git a/layoutparser-service/viewer-doc-processor/src/test/resources/viewerDocLayers.pdf b/layoutparser-service/viewer-doc-processor/src/test/resources/viewerDocLayers.pdf index 8848184..8904b04 100644 Binary files a/layoutparser-service/viewer-doc-processor/src/test/resources/viewerDocLayers.pdf and b/layoutparser-service/viewer-doc-processor/src/test/resources/viewerDocLayers.pdf differ