DM-307: Enable configuration of DocuMine paragraph classifications #18

Merged
dominique.eiflaender1 merged 1 commit from DM-307 into master 2023-06-27 11:57:30 +02:00
8 changed files with 247 additions and 112 deletions

View File

@ -1,115 +1,7 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import java.util.List;
import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class ClassificationService {
private final BodyTextFrameService bodyTextFrameService;
/**
 * Classifies the text blocks of every page of the given document.
 * <p>
 * Two body text frames are calculated up front (the second call passes {@code true}, stored here as the
 * landscape frame). Each page then gets its adjusted frame assigned before its blocks are classified.
 *
 * @param document the document whose pages are processed in place
 */
public void classifyDocument(ClassificationDocument document) {

    var fontSizeCounter = document.getFontSizeCounter();
    var pages = document.getPages();

    Rectangle portraitFrame = bodyTextFrameService.calculateBodyTextFrame(pages, fontSizeCounter, false);
    Rectangle landscapeFrame = bodyTextFrameService.calculateBodyTextFrame(pages, fontSizeCounter, true);

    // Font sizes above the dominant one; classifyBlock maps a block's size to a headline level by its position in this list.
    List<Float> largerFontSizes = fontSizeCounter.getHighterThanMostPopular();

    log.debug("Document FontSize counters are: {}", fontSizeCounter.getCountPerValue());

    for (ClassificationPage currentPage : pages) {
        bodyTextFrameService.setBodyTextFrameAdjustedToPage(currentPage, portraitFrame, landscapeFrame);
        classifyPage(currentPage, document, largerFontSizes);
    }
}
/**
 * Classifies all {@link TextPageBlock}s of one page; blocks of any other type are skipped.
 *
 * @param page              the page whose blocks are classified
 * @param document          the document the page belongs to
 * @param headlineFontSizes font sizes handed through to {@code classifyBlock} for headline level lookup
 */
public void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {

    for (AbstractPageBlock candidate : page.getTextBlocks()) {
        if (!(candidate instanceof TextPageBlock)) {
            continue;
        }
        TextPageBlock block = (TextPageBlock) candidate;
        classifyBlock(block, page, document, headlineFontSizes);
    }
}
/**
 * Assigns a {@link PageBlockType} to a single text block.
 * <p>
 * The branches below are evaluated in order and the first matching one wins:
 * header, footer, title (first page only), headline by font size, bold headline,
 * bold paragraph, regular paragraph, italic paragraph, unknown paragraph, other.
 * <p>
 * Two branches can match without setting a classification: the title branch when the block text
 * consists of digits only, and the font-size headline branch when the block's font size equals none
 * of the entries in {@code headlineFontSizes}.
 *
 * @param textBlock         the block to classify; its classification is set in place
 * @param page              the page containing the block (provides body text frame, rotation, page number)
 * @param document          the document (provides the font size / style / font / text height counters)
 * @param headlineFontSizes font sizes used to derive the headline level (1-based position in the list)
 */
public void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame();
// Without a most popular font size none of the size comparisons below can be made.
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.OTHER);
return;
}
// Header: over the body text frame and not larger than the most popular font size.
// NOTE(review): the "getMostPopular() == null" alternative here and in the footer branch cannot be true after the early return above.
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER);
// Footer: under the body text frame and not larger than the most popular font size.
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification(PageBlockType.FOOTER);
// Title candidate: first page only; either (height difference > 2.5 AND larger than the most popular font size) OR the only block on the page.
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) {
// Blocks consisting of digits only are not marked as title; no classification is set for them here.
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);
}
// Headline by font size: larger than the most popular size, approx. line count below 4.9, bold (or, if the document
// has no bold text at all, more than 1 above the most popular size), and the first text position is at least as
// large as the block's most popular word font size.
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
.getCountPerValue()
.containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
// The headline level is the 1-based position of the block's font size in headlineFontSizes.
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
textBlock.setClassification(PageBlockType.getHeadlineType(i));
document.setHeadlines(true);
}
}
// Bold headline: bold block inside the body text frame of a document that is not predominantly bold, approx. line
// count below 2.9, not starting with "Figure ". It gets the level right after all font-size based levels.
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
document.setHeadlines(true);
// Bold paragraph: body font size, bold, in a document whose most popular style is not bold.
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
// Regular paragraph: font, style and font size all equal the document's most popular values.
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
// Italic paragraph: body font size, italic, approx. line count below 2.9, in a document whose most popular style is not italic.
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
.getMostPopular()
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
// Anything else inside the body text frame is a paragraph of unknown kind.
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
// Everything outside the body text frame that matched none of the rules above.
} else {
textBlock.setClassification(PageBlockType.OTHER);
}
}
/**
 * Contract for services that classify the blocks of a parsed document.
 */
public interface ClassificationService {
/**
 * Classifies the given document.
 *
 * @param document the document to classify
 */
void classifyDocument(ClassificationDocument document);
}

View File

@ -0,0 +1,117 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.PositionUtils;
import com.iqser.red.service.redaction.v1.server.redaction.utils.Patterns;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
@ConditionalOnProperty(prefix = "application", name = "type", havingValue = "DocuMine")
public class DocuMineClassificationService implements ClassificationService{
private final BodyTextFrameService bodyTextFrameService;
public void classifyDocument(ClassificationDocument document) {
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
for (ClassificationPage page : document.getPages()) {
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
classifyPage(page, document, headlineFontSizes);
}
}
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
}
}
}
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame();
var pattern = Patterns.getCompiledPattern("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z]{2,50}", true);
var pattern2 = Patterns.getCompiledPattern(".*\\d$", true);
var pattern3 = Patterns.getCompiledPattern("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*", false);
Matcher matcher = pattern.matcher(textBlock.toString());
Matcher matcher2 = pattern2.matcher(textBlock.toString());
Matcher matcher3 = pattern3.matcher(textBlock.toString());
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("TABLE")) && !textBlock.toString().endsWith(":")) {
textBlock.setClassification(PageBlockType.getHeadlineType(1));
document.setHeadlines(true);
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && !matcher2.matches() && !matcher3.matches()) {
textBlock.setClassification(PageBlockType.getHeadlineType(2));
document.setHeadlines(true);
} else if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);
}
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
.getMostPopular()
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
} else {
textBlock.setClassification(PageBlockType.OTHER);
}
}
}

View File

@ -0,0 +1,117 @@
package com.iqser.red.service.redaction.v1.server.layoutparsing.classification.service;
import java.util.List;
import java.util.regex.Pattern;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.AbstractPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationDocument;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.ClassificationPage;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.PageBlockType;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.model.text.TextPageBlock;
import com.iqser.red.service.redaction.v1.server.layoutparsing.classification.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
@ConditionalOnProperty(prefix = "application", name = "type", havingValue = "RedactManager")
public class RedactManagerClassificationService implements ClassificationService {
private final BodyTextFrameService bodyTextFrameService;
public void classifyDocument(ClassificationDocument document) {
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
for (ClassificationPage page : document.getPages()) {
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
classifyPage(page, document, headlineFontSizes);
}
}
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
if (textBlock instanceof TextPageBlock) {
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
}
}
}
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
var bodyTextFrame = page.getBodyTextFrame();
if (document.getFontSizeCounter().getMostPopular() == null) {
textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
.size() == 1)) {
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
textBlock.setClassification(PageBlockType.TITLE);
}
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
.getCountPerValue()
.containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
for (int i = 1; i <= headlineFontSizes.size(); i++) {
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
textBlock.setClassification(PageBlockType.getHeadlineType(i));
document.setHeadlines(true);
}
}
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
.get(0)
.getTextPositions()
.get(0)
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
document.setHeadlines(true);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
textBlock.setClassification(PageBlockType.PARAGRAPH);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
.getMostPopular()
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
} else {
textBlock.setClassification(PageBlockType.OTHER);
}
}
}

View File

@ -16,3 +16,6 @@ redaction-service:
cvTableParsingEnabled: false
nerServiceEnabled: false
priorityMode: false
application:
type: "RedactManager"

View File

@ -41,3 +41,6 @@ management:
storage:
backend: 's3'
application:
type: "RedactManager"

View File

@ -33,7 +33,7 @@ import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.tenantcommons.TenantContext;
@ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT, properties = {"application.type=DocuMine"})
@Import(DocumineFloraTest.RedactionIntegrationTestConfiguration.class)
public class DocumineFloraTest extends AbstractRedactionIntegrationTest {

View File

@ -20,6 +20,9 @@ redaction-service:
enable-image-classification: false
enable-entity-recognition: true
application:
type: "RedactManager"
storage:
backend: 's3'

View File

@ -57,7 +57,7 @@ rule "H.0.0: Show headlines"
when
$headline: Headline()
then
// entityCreationService.bySemanticNode($headline, "headline", EntityType.RECOMMENDATION);
entityCreationService.bySemanticNode($headline, "headline", EntityType.RECOMMENDATION);
end