CLARI-002: fix some stuff with DocumentDataParser
* still todo, exclude semanticNodes inside TableCells
This commit is contained in:
parent
3a57d26e97
commit
7bb2293915
@ -10,36 +10,25 @@ import lombok.NonNull;
|
||||
@Builder
|
||||
@Schema(description = "Object containing all storage paths the service needs to know.")
|
||||
public record LayoutParsingRequest(
|
||||
@Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}")//
|
||||
@NonNull LayoutParsingType layoutParsingType,
|
||||
|
||||
@Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.")//
|
||||
Map<String, String> identifier,
|
||||
|
||||
@Schema(description = "Path to the original PDF file.")//
|
||||
@NonNull String originFileStorageId,//
|
||||
@NonNull String originFileStorageId,
|
||||
|
||||
Optional<String> tablesFileStorageId,
|
||||
Optional<String> imagesFileStorageId,
|
||||
|
||||
@Schema(description = "Optional Path to the table extraction file.")//
|
||||
Optional<String> tablesFileStorageId,//
|
||||
@Schema(description = "Optional Path to the image classification file.")//
|
||||
Optional<String> imagesFileStorageId,//
|
||||
Optional<String> visualLayoutParsingFileId,
|
||||
|
||||
@Schema(description = "Optional Path to the the visual layout parsing service file") Optional<String> visualLayoutParsingFileId,//
|
||||
|
||||
@Schema(description = "Path where the Document Structure File will be stored.")//
|
||||
@NonNull String structureFileStorageId,//
|
||||
@Schema(description = "Path where the Research Data File will be stored.")//
|
||||
String researchDocumentStorageId,//
|
||||
@Schema(description = "Path where the Document Text File will be stored.")//
|
||||
@NonNull String textBlockFileStorageId,//
|
||||
@Schema(description = "Path where the Document Positions File will be stored.")//
|
||||
@NonNull String positionBlockFileStorageId,//
|
||||
@Schema(description = "Path where the Document Pages File will be stored.")//
|
||||
@NonNull String pageFileStorageId,//
|
||||
@Schema(description = "Path where the Simplified Text File will be stored.")//
|
||||
@NonNull String simplifiedTextStorageId,//
|
||||
@Schema(description = "Path where the Viewer Document PDF will be stored.")//
|
||||
@NonNull String viewerDocumentStorageId) {
|
||||
@NonNull String structureFileStorageId,
|
||||
String researchDocumentStorageId,
|
||||
String markdownDocumentStorageId,
|
||||
@NonNull String textBlockFileStorageId,
|
||||
@NonNull String positionBlockFileStorageId,
|
||||
@NonNull String pageFileStorageId,
|
||||
@NonNull String simplifiedTextStorageId,
|
||||
@NonNull String viewerDocumentStorageId
|
||||
) {
|
||||
|
||||
}
|
||||
|
||||
@ -4,6 +4,7 @@ import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
@ -18,12 +19,15 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
import org.commonmark.ext.gfm.tables.TablesExtension;
|
||||
import org.commonmark.renderer.markdown.MarkdownRenderer;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.markdown.DocumentDataParser;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
@ -120,24 +124,18 @@ public class LayoutParsingPipeline {
|
||||
File viewerDocumentFile = originFile;
|
||||
|
||||
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
|
||||
if (layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.isPresent()) {
|
||||
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.get());
|
||||
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
|
||||
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get());
|
||||
}
|
||||
|
||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||
if (layoutParsingRequest.imagesFileStorageId()
|
||||
.isPresent()) {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
|
||||
.get());
|
||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
||||
}
|
||||
|
||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||
if (layoutParsingRequest.tablesFileStorageId()
|
||||
.isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
|
||||
.get());
|
||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||
}
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
|
||||
@ -155,12 +153,7 @@ public class LayoutParsingPipeline {
|
||||
|
||||
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
||||
|
||||
layoutGridService.addLayoutGrid(viewerDocumentFile,
|
||||
documentGraph,
|
||||
viewerDocumentFile,
|
||||
false,
|
||||
layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.isPresent());
|
||||
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent());
|
||||
|
||||
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
||||
|
||||
@ -174,6 +167,16 @@ public class LayoutParsingPipeline {
|
||||
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
|
||||
}
|
||||
|
||||
if (layoutParsingRequest.markdownDocumentStorageId() != null) {
|
||||
log.info("Rendering document data as markdown for {}", layoutParsingRequest.identifier());
|
||||
var markdownDocument = DocumentDataParser.parse(documentGraph.streamAllSubNodes());
|
||||
MarkdownRenderer renderer = MarkdownRenderer.builder().extensions(List.of(TablesExtension.create())).build();
|
||||
String markdown = renderer.render(markdownDocument);
|
||||
try (var in = new ByteArrayInputStream(markdown.getBytes())) {
|
||||
layoutParsingStorageService.storeObject(layoutParsingRequest.markdownDocumentStorageId(), in);
|
||||
}
|
||||
}
|
||||
|
||||
if (!viewerDocumentFile.equals(originFile)) {
|
||||
viewerDocumentFile.delete();
|
||||
}
|
||||
|
||||
@ -102,6 +102,11 @@ public class LayoutParsingStorageService {
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
|
||||
}
|
||||
|
||||
public void storeObject(String storageId, InputStream in) {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), storageId, in);
|
||||
}
|
||||
|
||||
|
||||
private File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException {
|
||||
|
||||
|
||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.markdown;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
@ -16,8 +17,10 @@ import org.commonmark.ext.gfm.tables.TableHead;
|
||||
import org.commonmark.ext.gfm.tables.TableRow;
|
||||
import org.commonmark.node.Document;
|
||||
import org.commonmark.node.Emphasis;
|
||||
import org.commonmark.node.HardLineBreak;
|
||||
import org.commonmark.node.Heading;
|
||||
import org.commonmark.node.Node;
|
||||
import org.commonmark.node.SoftLineBreak;
|
||||
import org.commonmark.node.StrongEmphasis;
|
||||
import org.commonmark.node.Text;
|
||||
|
||||
@ -88,14 +91,7 @@ public class DocumentDataParser {
|
||||
private Node createTableCell(com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell tc) {
|
||||
|
||||
var cell = new TableCell();
|
||||
if (tc.isLeaf()) {
|
||||
parseTextBlock(tc.getTextBlock()).forEach(cell::appendChild);
|
||||
} else {
|
||||
tc.streamChildren()
|
||||
.map(DocumentDataParser::parseNode)
|
||||
.filter(Objects::nonNull)
|
||||
.forEach(cell::appendChild);
|
||||
}
|
||||
parseTextBlock(tc.getTextBlock()).forEach(cell::appendChild);
|
||||
return cell;
|
||||
}
|
||||
|
||||
@ -103,7 +99,7 @@ public class DocumentDataParser {
|
||||
private org.commonmark.node.Paragraph parseParagraph(Paragraph paragraph) {
|
||||
|
||||
org.commonmark.node.Paragraph heading = new org.commonmark.node.Paragraph();
|
||||
parseTextBlock(paragraph.getTextBlock()).forEach(heading::appendChild);
|
||||
parseTextBlockWithLineBreaks(paragraph.getTextBlock()).forEach(heading::appendChild);
|
||||
return heading;
|
||||
}
|
||||
|
||||
@ -112,11 +108,56 @@ public class DocumentDataParser {
|
||||
|
||||
Heading heading = new Heading();
|
||||
heading.setLevel(headline.getTreeId().size());
|
||||
parseTextBlock(headline.getTextBlock()).forEach(heading::appendChild);
|
||||
parseTextBlockWithLineBreaks(headline.getTextBlock()).forEach(heading::appendChild);
|
||||
return heading;
|
||||
|
||||
}
|
||||
|
||||
private List<Node> parseTextBlockWithLineBreaks(TextBlock textBlock) {
|
||||
|
||||
LinkedList<Node> result = new LinkedList<>();
|
||||
List<TextRangeWithTextType> textRanges = mergeTextStyles(textBlock);
|
||||
for (TextRangeWithTextType textRange : textRanges) {
|
||||
if (textBlock.subSequenceWithLineBreaks(textRange.textRange()).equals("\n")) {
|
||||
result.add(new HardLineBreak());
|
||||
}
|
||||
String text = textBlock.subSequenceWithLineBreaks(textRange.textRange());
|
||||
String[] lines = text.split("\n");
|
||||
for (String line : lines) {
|
||||
String cleanedLine = line.trim();
|
||||
if (cleanedLine.isEmpty()) {
|
||||
result.add(new HardLineBreak());
|
||||
continue;
|
||||
}
|
||||
switch (textRange.fontStyle()) {
|
||||
case REGULAR -> result.add(new Text(cleanedLine));
|
||||
case BOLD -> {
|
||||
StrongEmphasis boldBlock = new StrongEmphasis();
|
||||
boldBlock.appendChild(new Text(cleanedLine));
|
||||
result.add(boldBlock);
|
||||
}
|
||||
case ITALIC -> {
|
||||
Emphasis italicBlock = new Emphasis("_");
|
||||
italicBlock.appendChild(new Text(cleanedLine));
|
||||
result.add(italicBlock);
|
||||
}
|
||||
case BOLD_ITALIC -> {
|
||||
Emphasis italicBlock = new Emphasis("_");
|
||||
|
||||
StrongEmphasis boldBlock = new StrongEmphasis();
|
||||
boldBlock.appendChild(new Text(cleanedLine));
|
||||
|
||||
italicBlock.appendChild(boldBlock);
|
||||
result.add(italicBlock);
|
||||
}
|
||||
}
|
||||
result.add(new HardLineBreak());
|
||||
}
|
||||
}
|
||||
result.removeLast();
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
private List<Node> parseTextBlock(TextBlock textBlock) {
|
||||
|
||||
@ -124,22 +165,22 @@ public class DocumentDataParser {
|
||||
List<TextRangeWithTextType> textRanges = mergeTextStyles(textBlock);
|
||||
for (TextRangeWithTextType textRange : textRanges) {
|
||||
switch (textRange.fontStyle()) {
|
||||
case REGULAR -> result.add(new Text(textBlock.subSequenceWithLineBreaks(textRange.textRange())));
|
||||
case REGULAR -> result.add(new Text(textBlock.subSequence(textRange.textRange()).toString()));
|
||||
case BOLD -> {
|
||||
StrongEmphasis boldBlock = new StrongEmphasis();
|
||||
boldBlock.appendChild(new Text(textBlock.subSequenceWithLineBreaks(textRange.textRange())));
|
||||
boldBlock.appendChild(new Text(textBlock.subSequence(textRange.textRange()).toString()));
|
||||
result.add(boldBlock);
|
||||
}
|
||||
case ITALIC -> {
|
||||
Emphasis italicBlock = new Emphasis();
|
||||
italicBlock.appendChild(new Text(textBlock.subSequenceWithLineBreaks(textRange.textRange())));
|
||||
Emphasis italicBlock = new Emphasis("_");
|
||||
italicBlock.appendChild(new Text(textBlock.subSequence(textRange.textRange()).toString()));
|
||||
result.add(italicBlock);
|
||||
}
|
||||
case BOLD_ITALIC -> {
|
||||
Emphasis italicBlock = new Emphasis();
|
||||
Emphasis italicBlock = new Emphasis("_");
|
||||
|
||||
StrongEmphasis boldBlock = new StrongEmphasis();
|
||||
boldBlock.appendChild(new Text(textBlock.subSequenceWithLineBreaks(textRange.textRange())));
|
||||
boldBlock.appendChild(new Text(textBlock.subSequence(textRange.textRange()).toString()));
|
||||
|
||||
italicBlock.appendChild(boldBlock);
|
||||
result.add(italicBlock);
|
||||
@ -154,38 +195,49 @@ public class DocumentDataParser {
|
||||
|
||||
List<TextRangeWithTextType> result = new ArrayList<>();
|
||||
|
||||
TreeMap<Integer, Set<FontStyle>> styleChanges = new TreeMap<>();
|
||||
TreeMap<Integer, Set<FontStyleChange>> styleChanges = new TreeMap<>();
|
||||
|
||||
int start = textBlock.getTextRange().start();
|
||||
int end = textBlock.getTextRange().end();
|
||||
|
||||
for (TextRange bold : textBlock.getBoldTextBoundaries()) {
|
||||
styleChanges.computeIfAbsent(bold.start(), k -> new HashSet<>()).add(FontStyle.BOLD);
|
||||
styleChanges.computeIfAbsent(bold.end(), k -> new HashSet<>()).add(FontStyle.REGULAR);
|
||||
styleChanges.computeIfAbsent(bold.start() + start, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.BOLD));
|
||||
styleChanges.computeIfAbsent(bold.end() + start, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.BOLD));
|
||||
}
|
||||
|
||||
for (TextRange italic : textBlock.getItalicTextBoundaries()) {
|
||||
styleChanges.computeIfAbsent(italic.start(), k -> new HashSet<>()).add(FontStyle.ITALIC);
|
||||
styleChanges.computeIfAbsent(italic.end(), k -> new HashSet<>()).add(FontStyle.REGULAR);
|
||||
styleChanges.computeIfAbsent(italic.start() + start, k -> new HashSet<>()).add(FontStyleChange.enter(FontStyle.ITALIC));
|
||||
styleChanges.computeIfAbsent(italic.end() + start, k -> new HashSet<>()).add(FontStyleChange.leave(FontStyle.ITALIC));
|
||||
}
|
||||
|
||||
if (styleChanges.isEmpty()) {
|
||||
result.add(new TextRangeWithTextType(new TextRange(0, textBlock.length()), FontStyle.REGULAR));
|
||||
result.add(new TextRangeWithTextType(new TextRange(start, end), FontStyle.REGULAR));
|
||||
return result;
|
||||
}
|
||||
|
||||
int start = 0;
|
||||
Set<FontStyle> currentStyles = new HashSet<>();
|
||||
currentStyles.add(FontStyle.REGULAR);
|
||||
|
||||
for (Map.Entry<Integer, Set<FontStyle>> entry : styleChanges.entrySet()) {
|
||||
for (Map.Entry<Integer, Set<FontStyleChange>> entry : styleChanges.entrySet()) {
|
||||
int point = entry.getKey();
|
||||
Set<FontStyle> changes = entry.getValue();
|
||||
Set<FontStyleChange> changes = entry.getValue();
|
||||
|
||||
if (point > start) {
|
||||
FontStyle style = determineFontStyle(currentStyles);
|
||||
result.add(new TextRangeWithTextType(new TextRange(start, point), style));
|
||||
}
|
||||
|
||||
currentStyles.removeAll(changes);
|
||||
currentStyles.addAll(changes);
|
||||
changes.stream()
|
||||
.filter(FontStyleChange::leave)
|
||||
.map(FontStyleChange::style)
|
||||
.toList()
|
||||
.forEach(currentStyles::remove);
|
||||
|
||||
currentStyles.addAll(changes.stream()
|
||||
.filter(FontStyleChange::enter)
|
||||
.map(FontStyleChange::style)
|
||||
.toList());
|
||||
|
||||
if (currentStyles.isEmpty()) {
|
||||
currentStyles.add(FontStyle.REGULAR);
|
||||
}
|
||||
@ -193,12 +245,14 @@ public class DocumentDataParser {
|
||||
start = point;
|
||||
}
|
||||
|
||||
if (start < textBlock.length()) {
|
||||
if (start < end) {
|
||||
FontStyle style = determineFontStyle(currentStyles);
|
||||
result.add(new TextRangeWithTextType(new TextRange(start, textBlock.length()), style));
|
||||
result.add(new TextRangeWithTextType(new TextRange(start, textBlock.getTextRange().end()), style));
|
||||
}
|
||||
|
||||
return result;
|
||||
return result.stream()
|
||||
.filter(t -> t.textRange.length() > 1)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -223,6 +277,27 @@ public class DocumentDataParser {
|
||||
BOLD_ITALIC;
|
||||
}
|
||||
|
||||
record FontStyleChange(boolean enter, FontStyle style) {
|
||||
|
||||
public static FontStyleChange enter(FontStyle style) {
|
||||
|
||||
return new FontStyleChange(true, style);
|
||||
}
|
||||
|
||||
|
||||
public static FontStyleChange leave(FontStyle style) {
|
||||
|
||||
return new FontStyleChange(false, style);
|
||||
}
|
||||
|
||||
|
||||
public boolean leave() {
|
||||
|
||||
return !enter;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
record TextRangeWithTextType(TextRange textRange, FontStyle fontStyle) {
|
||||
|
||||
}
|
||||
|
||||
@ -35,7 +35,6 @@ public class MessageHandler {
|
||||
private final ObjectMapper objectMapper;
|
||||
private final RabbitTemplate rabbitTemplate;
|
||||
private final static String X_PIPELINE_PREFIX = "X-PIPE-";
|
||||
private final LogFileWebEndpoint logFileWebEndpoint;
|
||||
|
||||
|
||||
@RabbitHandler
|
||||
|
||||
@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@Test
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
String filePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
|
||||
String filePath = "files/new/crafted document.pdf";
|
||||
|
||||
runForFile(filePath);
|
||||
}
|
||||
@ -79,9 +79,11 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
.forEach(log::info);
|
||||
|
||||
File tmpFile = new File("/tmp/layoutparserEND2END/" + fileName + "_VIEWER.pdf");
|
||||
File markdownTmpFile = new File("/tmp/layoutparserEND2END/" + fileName + ".md");
|
||||
assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs();
|
||||
|
||||
storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile);
|
||||
storageService.downloadTo(TENANT_ID, layoutParsingRequest.markdownDocumentStorageId(), markdownTmpFile);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -41,6 +41,7 @@ import lombok.SneakyThrows;
|
||||
@Import(AbstractTest.TestConfiguration.class)
|
||||
public abstract class AbstractTest {
|
||||
|
||||
public static final String MARKDOWN_FILE_ID = "markdown";
|
||||
@Autowired
|
||||
protected LayoutParsingStorageService layoutParsingStorageService;
|
||||
|
||||
@ -121,6 +122,7 @@ public abstract class AbstractTest {
|
||||
.pageFileStorageId(fileName + PAGES_FILE_ID)
|
||||
.simplifiedTextStorageId(fileName + SIMPLIFIED_ID)
|
||||
.viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID)
|
||||
.markdownDocumentStorageId(fileName + MARKDOWN_FILE_ID)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user