From 150aea55c08bde4e079886edf3b8d7859d5dd371 Mon Sep 17 00:00:00 2001 From: deiflaender Date: Fri, 4 Aug 2023 09:55:35 +0200 Subject: [PATCH] RED-5253: Ported last documine changes --- .../processor/services/PdfParsingService.java | 3 +++ .../DocuMineBlockificationService.java | 17 +++++++++++------ .../DocuMineClassificationService.java | 5 ++--- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java index 1f71d6d..d2fd738 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/PdfParsingService.java @@ -77,6 +77,9 @@ public class PdfParsingService { stripper.setStartPage(pageNumber); stripper.setEndPage(pageNumber); stripper.setPdpage(pdPage); + if(layoutParsingType.equals(LayoutParsingType.DOCUMINE)){ + stripper.setSortByPosition(true); + } stripper.getText(pdDocument); PDRectangle pdr = pdPage.getMediaBox(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java index a141621..8899c9b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocuMineBlockificationService.java @@ -5,6 +5,9 @@ import static java.util.stream.Collectors.toSet; import java.util.ArrayList; import java.util.Comparator; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; import org.springframework.stereotype.Service; @@ -23,6 +26,8 @@ public class DocuMineBlockificationService { static final float THRESHOLD = 1f; + Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z ()-]{2,50}", Pattern.CASE_INSENSITIVE); + /** * This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions. @@ -39,11 +44,7 @@ public class DocuMineBlockificationService { List chunkWords = new ArrayList<>(); List chunkBlockList1 = new ArrayList<>(); - float minX = 1000; - float maxX = 0; - float minY = 1000; - float maxY = 0; - + float minX = 1000, maxX = 0, minY = 1000, maxY = 0; TextPositionSequence prev = null; boolean wasSplitted = false; @@ -60,7 +61,10 @@ public class DocuMineBlockificationService { boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle() .contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold")); - if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap)) { + Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString()); + boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches(); + + if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) { Orientation prevOrientation = null; if (!chunkBlockList1.isEmpty()) { @@ -231,3 +235,4 @@ public class DocuMineBlockificationService { } } + diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java index f513145..691c60c 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java @@ -20,15 +20,14 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; - @Slf4j @Service @RequiredArgsConstructor public class DocuMineClassificationService { private final BodyTextFrameService bodyTextFrameService; - private static final Pattern pattern = Pattern.compile("^(\\d{1,1}\\.?){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z\\[\\]\\-]{2,50}", Pattern.CASE_INSENSITIVE); - private static final Pattern pattern2 = Pattern.compile(".*\\d{4}$", Pattern.CASE_INSENSITIVE); + private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE); + private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE); private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");