Merge branch 'RED-5253' into 'main'

RED-5253: Ported last documine changes

See merge request fforesight/layout-parser!7
This commit is contained in:
Dominique Eifländer 2023-08-04 09:59:57 +02:00
commit d9a3bbbd30
3 changed files with 16 additions and 9 deletions

View File

@ -77,6 +77,9 @@ public class PdfParsingService {
stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage);
if(layoutParsingType.equals(LayoutParsingType.DOCUMINE)){
stripper.setSortByPosition(true);
}
stripper.getText(pdDocument);
PDRectangle pdr = pdPage.getMediaBox();

View File

@ -5,6 +5,9 @@ import static java.util.stream.Collectors.toSet;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
@ -23,6 +26,8 @@ public class DocuMineBlockificationService {
static final float THRESHOLD = 1f;
Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z ()-]{2,50}", Pattern.CASE_INSENSITIVE);
/**
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
@ -39,11 +44,7 @@ public class DocuMineBlockificationService {
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
float minX = 1000;
float maxX = 0;
float minY = 1000;
float maxY = 0;
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
boolean wasSplitted = false;
@ -60,7 +61,10 @@ public class DocuMineBlockificationService {
boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle()
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap)) {
Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString());
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) {
Orientation prevOrientation = null;
if (!chunkBlockList1.isEmpty()) {
@ -231,3 +235,4 @@ public class DocuMineBlockificationService {
}
}

View File

@ -20,15 +20,14 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class DocuMineClassificationService {
private final BodyTextFrameService bodyTextFrameService;
private static final Pattern pattern = Pattern.compile("^(\\d{1,1}\\.?){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z\\[\\]\\-]{2,50}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern2 = Pattern.compile(".*\\d{4}$", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");