CLARI-002: fix some stuff with DocumentDataParser

* still todo, exclude semanticNodes inside TableCells
This commit is contained in:
Kilian Schuettler 2024-07-10 19:48:42 +02:00
parent 3a57d26e97
commit 7bb2293915
9 changed files with 149 additions and 74 deletions

View File

@ -10,36 +10,25 @@ import lombok.NonNull;
@Builder
@Schema(description = "Object containing all storage paths the service needs to know.")
public record LayoutParsingRequest(
@Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}")//
@NonNull LayoutParsingType layoutParsingType,
@Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.")//
Map<String, String> identifier,
@Schema(description = "Path to the original PDF file.")//
@NonNull String originFileStorageId,//
@NonNull String originFileStorageId,
Optional<String> tablesFileStorageId,
Optional<String> imagesFileStorageId,
@Schema(description = "Optional Path to the table extraction file.")//
Optional<String> tablesFileStorageId,//
@Schema(description = "Optional Path to the image classification file.")//
Optional<String> imagesFileStorageId,//
Optional<String> visualLayoutParsingFileId,
@Schema(description = "Optional Path to the the visual layout parsing service file") Optional<String> visualLayoutParsingFileId,//
@Schema(description = "Path where the Document Structure File will be stored.")//
@NonNull String structureFileStorageId,//
@Schema(description = "Path where the Research Data File will be stored.")//
String researchDocumentStorageId,//
@Schema(description = "Path where the Document Text File will be stored.")//
@NonNull String textBlockFileStorageId,//
@Schema(description = "Path where the Document Positions File will be stored.")//
@NonNull String positionBlockFileStorageId,//
@Schema(description = "Path where the Document Pages File will be stored.")//
@NonNull String pageFileStorageId,//
@Schema(description = "Path where the Simplified Text File will be stored.")//
@NonNull String simplifiedTextStorageId,//
@Schema(description = "Path where the Viewer Document PDF will be stored.")//
@NonNull String viewerDocumentStorageId) {
@NonNull String structureFileStorageId,
String researchDocumentStorageId,
String markdownDocumentStorageId,
@NonNull String textBlockFileStorageId,
@NonNull String positionBlockFileStorageId,
@NonNull String pageFileStorageId,
@NonNull String simplifiedTextStorageId,
@NonNull String viewerDocumentStorageId
) {
}

View File

@ -4,6 +4,7 @@ import static java.lang.String.format;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
@ -18,12 +19,15 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.commonmark.ext.gfm.tables.TablesExtension;
import org.commonmark.renderer.markdown.MarkdownRenderer;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.markdown.DocumentDataParser;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
@ -120,24 +124,18 @@ public class LayoutParsingPipeline {
File viewerDocumentFile = originFile;
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
if (layoutParsingRequest.visualLayoutParsingFileId()
.isPresent()) {
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId()
.get());
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get());
}
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
if (layoutParsingRequest.imagesFileStorageId()
.isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get());
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
}
TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId()
.isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
.get());
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
}
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
@ -155,12 +153,7 @@ public class LayoutParsingPipeline {
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
layoutGridService.addLayoutGrid(viewerDocumentFile,
documentGraph,
viewerDocumentFile,
false,
layoutParsingRequest.visualLayoutParsingFileId()
.isPresent());
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent());
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
@ -174,6 +167,16 @@ public class LayoutParsingPipeline {
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
}
// Optional markdown export: only performed when the request carries a markdown storage id.
if (layoutParsingRequest.markdownDocumentStorageId() != null) {
    log.info("Rendering document data as markdown for {}", layoutParsingRequest.identifier());
    var markdownDocument = DocumentDataParser.parse(documentGraph.streamAllSubNodes());
    // The tables extension is registered so that table nodes produced by the parser can be rendered.
    MarkdownRenderer renderer = MarkdownRenderer.builder().extensions(List.of(TablesExtension.create())).build();
    String markdown = renderer.render(markdownDocument);
    // Encode explicitly as UTF-8: the no-arg getBytes() uses the JVM default charset, which can
    // differ between hosts and would make the stored bytes environment dependent.
    try (var in = new ByteArrayInputStream(markdown.getBytes(java.nio.charset.StandardCharsets.UTF_8))) {
        layoutParsingStorageService.storeObject(layoutParsingRequest.markdownDocumentStorageId(), in);
    }
}
if (!viewerDocumentFile.equals(originFile)) {
viewerDocumentFile.delete();
}

View File

@ -102,6 +102,11 @@ public class LayoutParsingStorageService {
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
}
/**
 * Hands the given stream to the storage service under {@code storageId}, scoped to the tenant
 * reported by {@link TenantContext#getTenantId()}.
 *
 * @param storageId identifier passed through unchanged to the storage service
 * @param in        stream passed through unchanged to the storage service
 */
public void storeObject(String storageId, InputStream in) {

    final var tenantId = TenantContext.getTenantId();
    storageService.storeObject(tenantId, storageId, in);
}
private File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException {

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.markdown;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
@ -16,8 +17,10 @@ import org.commonmark.ext.gfm.tables.TableHead;
import org.commonmark.ext.gfm.tables.TableRow;
import org.commonmark.node.Document;
import org.commonmark.node.Emphasis;
import org.commonmark.node.HardLineBreak;
import org.commonmark.node.Heading;
import org.commonmark.node.Node;
import org.commonmark.node.SoftLineBreak;
import org.commonmark.node.StrongEmphasis;
import org.commonmark.node.Text;
@ -88,14 +91,7 @@ public class DocumentDataParser {
private Node createTableCell(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell tc) {
var cell = new TableCell();
if (tc.isLeaf()) {
parseTextBlock(tc.getTextBlock()).forEach(cell::appendChild);
} else {
tc.streamChildren()
.map(DocumentDataParser::parseNode)
.filter(Objects::nonNull)
.forEach(cell::appendChild);
}
parseTextBlock(tc.getTextBlock()).forEach(cell::appendChild);
return cell;
}
@ -103,7 +99,7 @@ public class DocumentDataParser {
private org.commonmark.node.Paragraph parseParagraph(Paragraph paragraph) {
org.commonmark.node.Paragraph heading = new org.commonmark.node.Paragraph();
parseTextBlock(paragraph.getTextBlock()).forEach(heading::appendChild);
parseTextBlockWithLineBreaks(paragraph.getTextBlock()).forEach(heading::appendChild);
return heading;
}
@ -112,11 +108,56 @@ public class DocumentDataParser {
Heading heading = new Heading();
heading.setLevel(headline.getTreeId().size());
parseTextBlock(headline.getTextBlock()).forEach(heading::appendChild);
parseTextBlockWithLineBreaks(headline.getTextBlock()).forEach(heading::appendChild);
return heading;
}
/**
 * Converts a text block into inline markdown nodes while keeping its line structure.
 * <p>
 * The block is cut into the style ranges returned by {@code mergeTextStyles}. The text of each range
 * is split at {@code "\n"}; every non-blank line is trimmed and emitted as a node matching the
 * range's font style, and every line (blank or not) is followed by a {@link HardLineBreak}.
 * The trailing break is dropped so the result does not end with a line break.
 *
 * @param textBlock the text block to convert
 * @return the inline nodes in document order; empty when {@code mergeTextStyles} yields no ranges
 */
private List<Node> parseTextBlockWithLineBreaks(TextBlock textBlock) {

    List<Node> result = new ArrayList<>();
    for (TextRangeWithTextType textRange : mergeTextStyles(textBlock)) {
        // Look the text up once; it feeds both the lone-newline check and the line split.
        String text = textBlock.subSequenceWithLineBreaks(textRange.textRange());
        if (text.equals("\n")) {
            // split("\n") yields no elements for a lone newline, so its break is added here.
            result.add(new HardLineBreak());
        }
        for (String line : text.split("\n")) {
            String cleanedLine = line.trim();
            if (!cleanedLine.isEmpty()) {
                switch (textRange.fontStyle()) {
                    case REGULAR -> result.add(new Text(cleanedLine));
                    case BOLD -> {
                        StrongEmphasis boldBlock = new StrongEmphasis();
                        boldBlock.appendChild(new Text(cleanedLine));
                        result.add(boldBlock);
                    }
                    case ITALIC -> {
                        Emphasis italicBlock = new Emphasis("_");
                        italicBlock.appendChild(new Text(cleanedLine));
                        result.add(italicBlock);
                    }
                    case BOLD_ITALIC -> {
                        Emphasis italicBlock = new Emphasis("_");
                        StrongEmphasis boldBlock = new StrongEmphasis();
                        boldBlock.appendChild(new Text(cleanedLine));
                        italicBlock.appendChild(boldBlock);
                        result.add(italicBlock);
                    }
                }
            }
            // Blank lines contribute only this break; non-blank lines get it after their node.
            result.add(new HardLineBreak());
        }
    }
    // Whenever anything was added, the last element is a HardLineBreak: drop it. The emptiness
    // guard matters because mergeTextStyles may return no ranges at all, in which case an
    // unconditional removal would throw.
    if (!result.isEmpty()) {
        result.remove(result.size() - 1);
    }
    return result;
}
private List<Node> parseTextBlock(TextBlock textBlock) {
@ -124,22 +165,22 @@ public class DocumentDataParser {
List<TextRangeWithTextType> textRanges = mergeTextStyles(textBlock);
for (TextRangeWithTextType textRange : textRanges) {
switch (textRange.fontStyle()) {
case REGULAR -> result.add(new Text(textBlock.subSequenceWithLineBreaks(textRange.textRange())));
case REGULAR -> result.add(new Text(textBlock.subSequence(textRange.textRange()).toString()));
case BOLD -> {
StrongEmphasis boldBlock = new StrongEmphasis();
boldBlock.appendChild(new Text(textBlock.subSequenceWithLineBreaks(textRange.textRange())));
boldBlock.appendChild(new Text(textBlock.subSequence(textRange.textRange()).toString()));
result.add(boldBlock);
}
case ITALIC -> {
Emphasis italicBlock = new Emphasis();
italicBlock.appendChild(new Text(textBlock.subSequenceWithLineBreaks(textRange.textRange())));
Emphasis italicBlock = new Emphasis("_");
italicBlock.appendChild(new Text(textBlock.subSequence(textRange.textRange()).toString()));
result.add(italicBlock);
}
case BOLD_ITALIC -> {
Emphasis italicBlock = new Emphasis();
Emphasis italicBlock = new Emphasis("_");
StrongEmphasis boldBlock = new StrongEmphasis();
boldBlock.appendChild(new Text(textBlock.subSequenceWithLineBreaks(textRange.textRange())));
boldBlock.appendChild(new Text(textBlock.subSequence(textRange.textRange()).toString()));
italicBlock.appendChild(boldBlock);
result.add(italicBlock);
@ -154,38 +195,49 @@ public class DocumentDataParser {
List<TextRangeWithTextType> result = new ArrayList<>();
TreeMap<Integer, Set<FontStyle>> styleChanges = new TreeMap<>();
TreeMap<Integer, Set<FontStyleChange>> styleChanges = new TreeMap<>();
int start = textBlock.getTextRange().start();
int end = textBlock.getTextRange().end();
for (TextRange bold : textBlock.getBoldTextBoundaries()) {
styleChanges.computeIfAbsent(bold.start(), k -> new HashSet<>()).add(FontStyle.BOLD);
styleChanges.computeIfAbsent(bold.end(), k -> new HashSet<>()).add(FontStyle.REGULAR);
styleChanges.computeIfAbsent(bold.start() + start, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.BOLD));
styleChanges.computeIfAbsent(bold.end() + start, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.BOLD));
}
for (TextRange italic : textBlock.getItalicTextBoundaries()) {
styleChanges.computeIfAbsent(italic.start(), k -> new HashSet<>()).add(FontStyle.ITALIC);
styleChanges.computeIfAbsent(italic.end(), k -> new HashSet<>()).add(FontStyle.REGULAR);
styleChanges.computeIfAbsent(italic.start() + start, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.ITALIC));
styleChanges.computeIfAbsent(italic.end() + start, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.ITALIC));
}
if (styleChanges.isEmpty()) {
result.add(new TextRangeWithTextType(new TextRange(0, textBlock.length()), FontStyle.REGULAR));
result.add(new TextRangeWithTextType(new TextRange(start, end), FontStyle.REGULAR));
return result;
}
int start = 0;
Set<FontStyle> currentStyles = new HashSet<>();
currentStyles.add(FontStyle.REGULAR);
for (Map.Entry<Integer, Set<FontStyle>> entry : styleChanges.entrySet()) {
for (Map.Entry<Integer, Set<FontStyleChange>> entry : styleChanges.entrySet()) {
int point = entry.getKey();
Set<FontStyle> changes = entry.getValue();
Set<FontStyleChange> changes = entry.getValue();
if (point > start) {
FontStyle style = determineFontStyle(currentStyles);
result.add(new TextRangeWithTextType(new TextRange(start, point), style));
}
currentStyles.removeAll(changes);
currentStyles.addAll(changes);
changes.stream()
.filter(FontStyleChange::leave)
.map(FontStyleChange::style)
.toList()
.forEach(currentStyles::remove);
currentStyles.addAll(changes.stream()
.filter(FontStyleChange::enter)
.map(FontStyleChange::style)
.toList());
if (currentStyles.isEmpty()) {
currentStyles.add(FontStyle.REGULAR);
}
@ -193,12 +245,14 @@ public class DocumentDataParser {
start = point;
}
if (start < textBlock.length()) {
if (start < end) {
FontStyle style = determineFontStyle(currentStyles);
result.add(new TextRangeWithTextType(new TextRange(start, textBlock.length()), style));
result.add(new TextRangeWithTextType(new TextRange(start, textBlock.getTextRange().end()), style));
}
return result;
return result.stream()
.filter(t -> t.textRange.length() > 1)
.toList();
}
@ -223,6 +277,27 @@ public class DocumentDataParser {
BOLD_ITALIC;
}
/**
 * A single font style transition: either the given {@link FontStyle} starts to apply
 * ({@code enter == true}) or stops applying ({@code enter == false}).
 *
 * @param enter {@code true} when the style is switched on, {@code false} when it is switched off
 * @param style the font style affected by this transition
 */
record FontStyleChange(boolean enter, FontStyle style) {

    /**
     * @return {@code true} when this transition switches the style off
     */
    public boolean leave() {

        return !enter();
    }


    /**
     * Creates a transition that switches {@code style} off.
     */
    public static FontStyleChange leave(FontStyle style) {

        final boolean entering = false;
        return new FontStyleChange(entering, style);
    }


    /**
     * Creates a transition that switches {@code style} on.
     */
    public static FontStyleChange enter(FontStyle style) {

        final boolean entering = true;
        return new FontStyleChange(entering, style);
    }

}
/**
 * Pairs a {@link TextRange} with the {@link FontStyle} assigned to it.
 *
 * @param textRange the range of text covered
 * @param fontStyle the font style assigned to that range
 */
record TextRangeWithTextType(TextRange textRange, FontStyle fontStyle) {
}

View File

@ -35,7 +35,6 @@ public class MessageHandler {
private final ObjectMapper objectMapper;
private final RabbitTemplate rabbitTemplate;
private final static String X_PIPELINE_PREFIX = "X-PIPE-";
private final LogFileWebEndpoint logFileWebEndpoint;
@RabbitHandler

View File

@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Test
public void testLayoutParserEndToEnd() {
String filePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
String filePath = "files/new/crafted document.pdf";
runForFile(filePath);
}
@ -79,9 +79,11 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
.forEach(log::info);
File tmpFile = new File("/tmp/layoutparserEND2END/" + fileName + "_VIEWER.pdf");
File markdownTmpFile = new File("/tmp/layoutparserEND2END/" + fileName + ".md");
assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs();
storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile);
storageService.downloadTo(TENANT_ID, layoutParsingRequest.markdownDocumentStorageId(), markdownTmpFile);
}

View File

@ -41,6 +41,7 @@ import lombok.SneakyThrows;
@Import(AbstractTest.TestConfiguration.class)
public abstract class AbstractTest {
public static final String MARKDOWN_FILE_ID = "markdown";
@Autowired
protected LayoutParsingStorageService layoutParsingStorageService;
@ -121,6 +122,7 @@ public abstract class AbstractTest {
.pageFileStorageId(fileName + PAGES_FILE_ID)
.simplifiedTextStorageId(fileName + SIMPLIFIED_ID)
.viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID)
.markdownDocumentStorageId(fileName + MARKDOWN_FILE_ID)
.build();
}