diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 1016dda..bb3e95d 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -119,14 +119,18 @@ public class LayoutParsingPipeline { log.info("Starting layout parsing for {}", layoutParsingRequest.identifier()); File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId()); - File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile); + File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()) + .orElse(originFile); VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId() - .map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse()); + .map(layoutParsingStorageService::getVisualLayoutParsingFile) + .orElse(new VisualLayoutParsingResponse()); ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId() - .map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse()); + .map(layoutParsingStorageService::getImagesFile) + .orElse(new ImageServiceResponse()); TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId() - .map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse()); + .map(layoutParsingStorageService::getTablesFile) + .orElse(new 
TableServiceResponse()); ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null // ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), @@ -143,13 +147,20 @@ public class LayoutParsingPipeline { log.info("Creating viewer document for {}", layoutParsingRequest.identifier()); - layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent()); + layoutGridService.addLayoutGrid(viewerDocumentFile, + documentGraph, + viewerDocumentFile, + false, + layoutParsingRequest.visualLayoutParsingFileId() + .isPresent()); log.info("Storing resulting files for {}", layoutParsingRequest.identifier()); layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph)); - if (layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) { - layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(), new MarkdownMapper().toMarkdownContent(documentGraph)); + if (layoutParsingRequest.documentMarkdownFileStorageId() + .isPresent()) { + layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId() + .get(), new MarkdownMapper().toMarkdownContent(documentGraph)); } layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile); @@ -379,6 +390,12 @@ public class LayoutParsingPipeline { case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument); } + if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) { + for (ClassificationPage page : classificationDocument.getPages()) { + docuMineBlockificationService.mergeblocks(page, page.getCleanRulings().withoutTextRulings(), 0, 10); + } + } + 
List headlines = classificationDocument.getPages() .stream() .flatMap(classificationPage -> classificationPage.getTextBlocks() diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java index c5d9e67..ca64f65 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java @@ -2,19 +2,23 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica import java.util.ArrayList; import java.util.List; +import java.util.ListIterator; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; +import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; +@SuppressWarnings("all") @Service public class DocuMineBlockificationService 
{
@@ -57,8 +61,11 @@ public class DocuMineBlockificationService {
             boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
             boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
             boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 //
-                    && (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") //
-                    || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
+                    && (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold")
+                    //
+                    || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")
+                    || Math.abs(prev.getFontSize() - word.getFontSize()) >= 1
+                    || Math.abs(word.getTextHeight() - prev.getTextHeight()) > 0.8);
 
             Matcher matcher = pattern.matcher(chunkWords.stream()
                     .collect(Collectors.joining(" ")).toString());
@@ -120,5 +127,94 @@ public class DocuMineBlockificationService {
         return new ClassificationPage(textPageBlocks);
     }
 
+
+    /**
+     * Merges text blocks of a page that intersect within the given thresholds.
+     * <p>
+     * Two blocks are merged when neither is a table or an outline-detected headline, no ruling
+     * lies between them, their directions are identical, they intersect within the thresholds and
+     * the surviving block has no classification or the same classification as the absorbed one.
+     * The absorbed block's list slot is set to {@code null} during the pass and all such slots are
+     * removed at the end, so the list returned by {@code page.getTextBlocks()} is modified in place.
+     *
+     * @param page        page whose text block list is processed
+     * @param usedRulings rulings that must not lie between two merged blocks
+     * @param xThreshold  x threshold forwarded to the intersection test
+     * @param yThreshold  y threshold forwarded to the intersection test
+     */
+    public void mergeblocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) {
+
+        var blocks = page.getTextBlocks();
+        ListIterator<AbstractPageBlock> blockIterator = blocks.listIterator();
+        while (blockIterator.hasNext()) {
+            AbstractPageBlock block = blockIterator.next();
+            // A null slot marks a block that was already absorbed by an earlier merge.
+            if (block == null || block instanceof TablePageBlock) {
+                continue;
+            }
+
+            TextPageBlock current = (TextPageBlock) block;
+
+            for (int i = 0; i < blocks.size(); i++) {
+
+                AbstractPageBlock candidate = blocks.get(i);
+                if (candidate == null || candidate == current || candidate instanceof TablePageBlock) {
+                    continue;
+                }
+
+                if (isHeadlineFromOutline(current) || isHeadlineFromOutline(candidate)) {
+                    continue;
+                }
+
+                TextPageBlock inner = (TextPageBlock) candidate;
+
+                if (usedRulings.lineBetween(current, candidate)) {
+                    continue;
+                }
+
+                if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold) && (current.getClassification() == null || current.getClassification()
+                        .equals(inner.getClassification()))) {
+
+                    boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
+                    current.getSequences().addAll(inner.getSequences());
+                    // Rebuild the block from the combined sequences, then re-apply classification and duplicate flag.
+                    current = buildTextBlock(current.getSequences(), 0);
+                    current.setClassification(inner.getClassification());
+                    current.setToDuplicate(toDuplicate);
+                    blocks.set(i, null);
+                    blockIterator.set(current);
+                }
+            }
+        }
+        // Drop the slots of all blocks that were absorbed into another block.
+        blocks.removeIf(absorbed -> absorbed == null);
+    }
+
+
+    /**
+     * Tells whether a block is a headline that was detected by the outline layout engine.
+     *
+     * @param abstractPageBlock block to inspect
+     * @return {@code true} if the block's engines contain {@link LayoutEngine#OUTLINE} and its
+     *         classification is non-null and reports itself as a headline
+     */
+    private boolean isHeadlineFromOutline(AbstractPageBlock abstractPageBlock) {
+
+        return abstractPageBlock.getEngines().contains(LayoutEngine.OUTLINE) && abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline();
+    }
+
+
+    /**
+     * Creates a text block from the given sequences.
+     *
+     * @param wordBlockList sequences the new block is constructed from
+     * @param indexOnPage   not used by this implementation
+     * @return a new {@link TextPageBlock} constructed from {@code wordBlockList}
+     */
+    public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
+
+        return new TextPageBlock(wordBlockList);
+    }
+
 }
 
diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
index 3554372..34b8dd4 100644
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java
@@ -74,7 +74,7 @@ public class DocuMineClassificationService {
             return;
         }
         if (document.getFontSizeCounter().getMostPopular() ==
null) { - textBlock.setClassification(PageBlockType.OTHER); + textBlock.setClassification(PageBlockType.PARAGRAPH); return; } if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) // @@ -108,7 +108,10 @@ public class DocuMineClassificationService { && Character.isDigit(textBlock.toString().charAt(0)) && atLeast3Matcher.reset().find() && !textBlock.toString().contains(":") // - || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && atLeast3Matcher.reset().find() && !textBlock.toString().contains(":") // + || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) + && atLeast3Matcher.reset().find() + && !textBlock.toString().contains(":") + && !textBlock.toString().startsWith("(")// || textBlock.toString().startsWith("APPENDIX") // || textBlock.toString().startsWith("FIGURE") // || textBlock.toString().startsWith("Continued TABLE") // @@ -143,9 +146,9 @@ public class DocuMineClassificationService { && PositionUtils.getApproxLineCount(textBlock) < 2.9) { textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { - textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN); + textBlock.setClassification(PageBlockType.PARAGRAPH); } else { - textBlock.setClassification(PageBlockType.OTHER); + textBlock.setClassification(PageBlockType.PARAGRAPH); } }