RED-9974: Improved headline detection for documine old
This commit is contained in:
parent
e3e9d16145
commit
bb40345f79
@ -119,14 +119,18 @@ public class LayoutParsingPipeline {
|
||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||
|
||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
|
||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
|
||||
.orElse(originFile);
|
||||
|
||||
VisualLayoutParsingResponse visualLayoutParsingResponse = layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.map(layoutParsingStorageService::getVisualLayoutParsingFile).orElse(new VisualLayoutParsingResponse());
|
||||
.map(layoutParsingStorageService::getVisualLayoutParsingFile)
|
||||
.orElse(new VisualLayoutParsingResponse());
|
||||
ImageServiceResponse imageServiceResponse = layoutParsingRequest.imagesFileStorageId()
|
||||
.map(layoutParsingStorageService::getImagesFile).orElse(new ImageServiceResponse());
|
||||
.map(layoutParsingStorageService::getImagesFile)
|
||||
.orElse(new ImageServiceResponse());
|
||||
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
|
||||
.map(layoutParsingStorageService::getTablesFile).orElse(new TableServiceResponse());
|
||||
.map(layoutParsingStorageService::getTablesFile)
|
||||
.orElse(new TableServiceResponse());
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
|
||||
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
|
||||
@ -143,13 +147,20 @@ public class LayoutParsingPipeline {
|
||||
|
||||
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
||||
|
||||
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent());
|
||||
layoutGridService.addLayoutGrid(viewerDocumentFile,
|
||||
documentGraph,
|
||||
viewerDocumentFile,
|
||||
false,
|
||||
layoutParsingRequest.visualLayoutParsingFileId()
|
||||
.isPresent());
|
||||
|
||||
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
||||
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
||||
if (layoutParsingRequest.documentMarkdownFileStorageId().isPresent()) {
|
||||
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId().get(), new MarkdownMapper().toMarkdownContent(documentGraph));
|
||||
if (layoutParsingRequest.documentMarkdownFileStorageId()
|
||||
.isPresent()) {
|
||||
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
|
||||
.get(), new MarkdownMapper().toMarkdownContent(documentGraph));
|
||||
}
|
||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
|
||||
@ -379,6 +390,12 @@ public class LayoutParsingPipeline {
|
||||
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||
for (ClassificationPage page : classificationDocument.getPages()) {
|
||||
docuMineBlockificationService.mergeblocks(page, page.getCleanRulings().withoutTextRulings(), 0, 10);
|
||||
}
|
||||
}
|
||||
|
||||
List<TextPageBlock> headlines = classificationDocument.getPages()
|
||||
.stream()
|
||||
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||
|
||||
@ -2,19 +2,23 @@ package com.knecon.fforesight.service.layoutparser.processor.services.blockifica
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
@SuppressWarnings("all")
|
||||
@Service
|
||||
public class DocuMineBlockificationService {
|
||||
|
||||
@ -57,8 +61,11 @@ public class DocuMineBlockificationService {
|
||||
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 //
|
||||
&& (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold") //
|
||||
|| prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||
&& (word.getFontStyle().contains("bold") && !prev.getFontStyle().contains("bold")
|
||||
//
|
||||
|| prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")
|
||||
|| Math.abs(prev.getFontSize() - word.getFontSize()) >= 1
|
||||
|| Math.abs(word.getTextHeight() - prev.getTextHeight()) > 0.8);
|
||||
|
||||
Matcher matcher = pattern.matcher(chunkWords.stream()
|
||||
.collect(Collectors.joining(" ")).toString());
|
||||
@ -120,5 +127,77 @@ public class DocuMineBlockificationService {
|
||||
return new ClassificationPage(textPageBlocks);
|
||||
}
|
||||
|
||||
|
||||
public void mergeblocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) {
|
||||
|
||||
var blocks = page.getTextBlocks();
|
||||
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
||||
while (itty.hasNext()) {
|
||||
AbstractPageBlock block = itty.next();
|
||||
if (block == null) {
|
||||
continue;
|
||||
}
|
||||
if (block instanceof TablePageBlock) {
|
||||
continue;
|
||||
}
|
||||
|
||||
TextPageBlock current = (TextPageBlock) block;
|
||||
|
||||
for (int i = 0; i < blocks.size(); i++) {
|
||||
|
||||
AbstractPageBlock abstractPageBlock = blocks.get(i);
|
||||
if (abstractPageBlock == null) {
|
||||
continue;
|
||||
}
|
||||
if (abstractPageBlock == current) {
|
||||
continue;
|
||||
}
|
||||
if (abstractPageBlock instanceof TablePageBlock) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isHeadlineFromOutline(current) || isHeadlineFromOutline(abstractPageBlock)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
|
||||
|
||||
if (usedRulings.lineBetween(current, blocks.get(i))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold) && (current.getClassification() == null || current.getClassification()
|
||||
.equals(inner.getClassification()))) {
|
||||
|
||||
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||
current.getSequences().addAll(inner.getSequences());
|
||||
current = buildTextBlock(current.getSequences(), 0);
|
||||
current.setClassification(inner.getClassification());
|
||||
current.setToDuplicate(toDuplicate);
|
||||
blocks.set(i, null);
|
||||
itty.set(current);
|
||||
}
|
||||
}
|
||||
}
|
||||
var blocksIterator = blocks.iterator();
|
||||
while (blocksIterator.hasNext()) {
|
||||
if (blocksIterator.next() == null) {
|
||||
blocksIterator.remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean isHeadlineFromOutline(AbstractPageBlock abstractPageBlock) {
|
||||
|
||||
return abstractPageBlock.getEngines().contains(LayoutEngine.OUTLINE) && abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline();
|
||||
}
|
||||
|
||||
|
||||
public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
|
||||
return new TextPageBlock(wordBlockList);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -74,7 +74,7 @@ public class DocuMineClassificationService {
|
||||
return;
|
||||
}
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
return;
|
||||
}
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) //
|
||||
@ -108,7 +108,10 @@ public class DocuMineClassificationService {
|
||||
&& Character.isDigit(textBlock.toString().charAt(0))
|
||||
&& atLeast3Matcher.reset().find()
|
||||
&& !textBlock.toString().contains(":") //
|
||||
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && atLeast3Matcher.reset().find() && !textBlock.toString().contains(":") //
|
||||
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT))
|
||||
&& atLeast3Matcher.reset().find()
|
||||
&& !textBlock.toString().contains(":")
|
||||
&& !textBlock.toString().startsWith("(")//
|
||||
|| textBlock.toString().startsWith("APPENDIX") //
|
||||
|| textBlock.toString().startsWith("FIGURE") //
|
||||
|| textBlock.toString().startsWith("Continued TABLE") //
|
||||
@ -143,9 +146,9 @@ public class DocuMineClassificationService {
|
||||
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
} else {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user