diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index d0ee204..f10ac3b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -23,7 +23,7 @@ import lombok.extern.slf4j.Slf4j; @RequiredArgsConstructor public class DocuMineClassificationService { - private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); + private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE); private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*"); @@ -63,46 +63,54 @@ public class DocuMineClassificationService { return; } if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) - || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() - .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) - ) { + || PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null + || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + .getMostPopular())) { textBlock.setClassification(PageBlockType.HEADER); } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) - || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() - .getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) - ) { + || PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null + || textBlock.getHighestFontSize() <= document.getFontSizeCounter() + .getMostPopular())) { textBlock.setClassification(PageBlockType.FOOTER); - } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, - document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() - .size() == 1)) { + } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5 + && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) { if (!Pattern.matches("[0-9]+", textBlock.toString())) { textBlock.setClassification(PageBlockType.TITLE); } - } else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter() - .getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9 - - && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString() - .contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString() - .contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString() - .startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) { + } else if (textBlock.getText().length() > 5 + && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular() + || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) + && PositionUtils.getApproxLineCount(textBlock) < 5.9 + && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString() + .contains(":") + || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":") + || textBlock.toString().startsWith("APPENDIX") + || textBlock.toString().startsWith("FIGURE") + || textBlock.toString().startsWith("TABLE")) + && !textBlock.toString().endsWith(":") + && matcher2.reset().find()) { textBlock.setClassification(PageBlockType.getHeadlineType(1)); document.setHeadlines(true); - } else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) { + } else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) { textBlock.setClassification(PageBlockType.getHeadlineType(2)); document.setHeadlines(true); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() - .getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() + && textBlock.getMostPopularWordStyle().equals("bold") + && !document.getFontStyleCounter().getMostPopular().equals("bold")) { textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont() - .equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle() - .equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular()) + && textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular()) + && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) { textBlock.setClassification(PageBlockType.PARAGRAPH); - } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter() - .getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter() - .getMostPopular() - .equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) { + } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) + && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular() + && textBlock.getMostPopularWordStyle().equals("italic") + && !document.getFontStyleCounter().getMostPopular().equals("italic") + && PositionUtils.getApproxLineCount(textBlock) < 2.9) { textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC); } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) { textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);