add LayoutParsingType to BodyTextFrameService
This commit is contained in:
parent
65ab5eca22
commit
a41c13fdd6
@ -28,6 +28,18 @@ public class Ruling extends Line2D.Float {
|
||||
super(p1, p2);
|
||||
}
|
||||
|
||||
public Ruling straightenVertical() {
|
||||
|
||||
double x = (getX1() + getX2()) / 2;
|
||||
return new Ruling(new Point2D.Double(x, getY1()), new Point2D.Double(x, getY2()));
|
||||
}
|
||||
|
||||
public Ruling straightenHorizonatl() {
|
||||
|
||||
double y = (getY1() + getY2()) / 2;
|
||||
return new Ruling(new Point2D.Double(getX1(), y), new Point2D.Double(getX2(), y));
|
||||
}
|
||||
|
||||
|
||||
public static List<Ruling> cropRulingsToArea(List<Ruling> rulings, Rectangle2D area) {
|
||||
|
||||
|
||||
@ -6,6 +6,7 @@ import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
@ -17,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
@Service
|
||||
public class BodyTextFrameService {
|
||||
|
||||
private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.0f;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
@ -68,7 +69,14 @@ public class BodyTextFrameService {
|
||||
* @param landscape Calculate for landscape or portrait
|
||||
* @return Rectangle of the text frame
|
||||
*/
|
||||
public Rectangle calculateBodyTextFrame(List<ClassificationPage> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape) {
|
||||
public Rectangle calculateBodyTextFrame(List<ClassificationPage> pages, FloatFrequencyCounter documentFontSizeCounter, boolean landscape, LayoutParsingType layoutParsingType) {
|
||||
|
||||
float approximateHeaderLineCount;
|
||||
if (layoutParsingType.equals(LayoutParsingType.TAAS)) {
|
||||
approximateHeaderLineCount = 3.3f;
|
||||
} else {
|
||||
approximateHeaderLineCount = 2.9f;
|
||||
}
|
||||
|
||||
BodyTextFrameExpansionsRectangle expansionsRectangle = new BodyTextFrameExpansionsRectangle();
|
||||
|
||||
@ -87,7 +95,8 @@ public class BodyTextFrameService {
|
||||
}
|
||||
|
||||
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
|
||||
if (approxLineCount < APPROXIMATE_HEADER_LINE_COUNT) {
|
||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10)
|
||||
|| !layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount){
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
@ -8,6 +8,7 @@ import java.util.regex.Pattern;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
@ -33,8 +34,8 @@ public class DocuMineClassificationService {
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
|
||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
|
||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false, LayoutParsingType.DOCUMINE);
|
||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true, LayoutParsingType.DOCUMINE);
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
@ -69,20 +70,7 @@ public class DocuMineClassificationService {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
return;
|
||||
}
|
||||
if (textBlock.getText().length() > 6 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
|
||||
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||
|
||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
||||
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
|
||||
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
|
||||
.startsWith("TABLE")) && !textBlock.toString().endsWith(":")) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
||||
document.setHeadlines(true);
|
||||
|
||||
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && !matcher3.matches() && !matcher2.matches()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
@ -95,6 +83,19 @@ public class DocuMineClassificationService {
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification(PageBlockType.TITLE);
|
||||
}
|
||||
} else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
|
||||
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||
|
||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
||||
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
|
||||
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
|
||||
.startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
||||
document.setHeadlines(true);
|
||||
|
||||
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||
|
||||
@ -6,6 +6,7 @@ import java.util.regex.Pattern;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
@ -27,8 +28,8 @@ public class RedactManagerClassificationService {
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
|
||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
|
||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false, LayoutParsingType.REDACT_MANAGER);
|
||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true, LayoutParsingType.REDACT_MANAGER);
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
@ -6,6 +6,7 @@ import java.util.regex.Pattern;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
@ -27,8 +28,8 @@ public class TaasClassificationService {
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
|
||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
|
||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false, LayoutParsingType.TAAS);
|
||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true, LayoutParsingType.TAAS);
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user