Merge branch 'Clarifynd' into 'main'
Clarifynd See merge request fforesight/layout-parser!113
This commit is contained in:
commit
f4cae8a7dc
@ -47,6 +47,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.TableExtrac
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClarifyndClassificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
@ -88,6 +89,7 @@ public class LayoutParsingPipeline {
|
|||||||
LayoutGridService layoutGridService;
|
LayoutGridService layoutGridService;
|
||||||
ObservationRegistry observationRegistry;
|
ObservationRegistry observationRegistry;
|
||||||
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||||
|
ClarifyndClassificationService clarifyndClassificationService;
|
||||||
|
|
||||||
|
|
||||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||||
@ -291,7 +293,7 @@ public class LayoutParsingPipeline {
|
|||||||
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
||||||
docstrumBlockificationService.combineBlocks(classificationPage);
|
docstrumBlockificationService.combineBlocks(classificationPage);
|
||||||
} else if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
} else if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
||||||
docstrumBlockificationService.mergeIntersectingBlocks(classificationPage.getTextBlocks());
|
docstrumBlockificationService.mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 6.5f);
|
||||||
}
|
}
|
||||||
|
|
||||||
buildPageStatistics(classificationPage);
|
buildPageStatistics(classificationPage);
|
||||||
@ -306,9 +308,10 @@ public class LayoutParsingPipeline {
|
|||||||
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
||||||
log.info("Classify TextBlocks for {}", identifier);
|
log.info("Classify TextBlocks for {}", identifier);
|
||||||
switch (layoutParsingType) {
|
switch (layoutParsingType) {
|
||||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||||
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||||
|
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
||||||
}
|
}
|
||||||
|
|
||||||
log.info("Building Sections for {}", identifier);
|
log.info("Building Sections for {}", identifier);
|
||||||
|
|||||||
@ -80,7 +80,7 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
public void combineBlocks(ClassificationPage page) {
|
public void combineBlocks(ClassificationPage page) {
|
||||||
|
|
||||||
mergeIntersectingBlocks(page.getTextBlocks());
|
mergeIntersectingBlocks(page.getTextBlocks(), 0, 0);
|
||||||
|
|
||||||
TextPageBlock previous = new TextPageBlock();
|
TextPageBlock previous = new TextPageBlock();
|
||||||
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
||||||
@ -123,7 +123,7 @@ public class DocstrumBlockificationService {
|
|||||||
previous = current;
|
previous = current;
|
||||||
}
|
}
|
||||||
|
|
||||||
mergeIntersectingBlocks(page.getTextBlocks());
|
mergeIntersectingBlocks(page.getTextBlocks(), 0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -203,7 +203,7 @@ public class DocstrumBlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void mergeIntersectingBlocks(List<AbstractPageBlock> blocks) {
|
public void mergeIntersectingBlocks(List<AbstractPageBlock> blocks, float xThreshold, float yThreshold) {
|
||||||
|
|
||||||
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
||||||
Set<AbstractPageBlock> toRemove = new HashSet<>();
|
Set<AbstractPageBlock> toRemove = new HashSet<>();
|
||||||
@ -237,7 +237,7 @@ public class DocstrumBlockificationService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, 0, 0)) {
|
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
|
||||||
|
|
||||||
current.getSequences().addAll(inner.getSequences());
|
current.getSequences().addAll(inner.getSequences());
|
||||||
QuickSort.sort(current.getSequences(), new TextPositionSequenceComparator());
|
QuickSort.sort(current.getSequences(), new TextPositionSequenceComparator());
|
||||||
|
|||||||
@ -0,0 +1,114 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
@Service
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class ClarifyndClassificationService {
|
||||||
|
|
||||||
|
public void classifyDocument(ClassificationDocument document) {
|
||||||
|
|
||||||
|
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||||
|
|
||||||
|
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||||
|
|
||||||
|
for (ClassificationPage page : document.getPages()) {
|
||||||
|
classifyPage(page, document, headlineFontSizes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||||
|
|
||||||
|
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
||||||
|
if (textBlock instanceof TextPageBlock) {
|
||||||
|
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||||
|
|
||||||
|
var bodyTextFrame = page.getBodyTextFrame();
|
||||||
|
|
||||||
|
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||||
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
||||||
|
textBlock,
|
||||||
|
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||||
|
.getMostPopular())) {
|
||||||
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
|
|
||||||
|
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
||||||
|
textBlock,
|
||||||
|
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||||
|
.getMostPopular())) {
|
||||||
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
|
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||||
|
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||||
|
.size() == 1)) {
|
||||||
|
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||||
|
textBlock.setClassification(PageBlockType.TITLE);
|
||||||
|
}
|
||||||
|
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
|
||||||
|
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
|
||||||
|
.getCountPerValue()
|
||||||
|
.containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
|
||||||
|
.get(0)
|
||||||
|
.getTextPositions()
|
||||||
|
.get(0)
|
||||||
|
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||||
|
|
||||||
|
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
||||||
|
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
||||||
|
textBlock.setClassification(PageBlockType.getHeadlineType(i));
|
||||||
|
document.setHeadlines(true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
|
||||||
|
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
|
||||||
|
.get(0)
|
||||||
|
.getTextPositions()
|
||||||
|
.get(0)
|
||||||
|
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||||
|
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
|
||||||
|
document.setHeadlines(true);
|
||||||
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||||
|
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||||
|
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||||
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
|
||||||
|
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
|
||||||
|
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||||
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||||
|
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
|
||||||
|
.getMostPopular()
|
||||||
|
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||||
|
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
||||||
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||||
|
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
||||||
|
} else {
|
||||||
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@ -27,7 +27,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
|
|
||||||
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
String fileName = "files/bdr/notMergedParagraphs.pdf";
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
|
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
@ -35,7 +35,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
Document document = buildGraph(fileName, LayoutParsingType.CLARIFYND);
|
||||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
||||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||||
}
|
}
|
||||||
|
|||||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user