Merge branch 'RED-5253' into 'main'
RED-5253: Ported last documine changes See merge request fforesight/layout-parser!7
This commit is contained in:
commit
d9a3bbbd30
@ -77,6 +77,9 @@ public class PdfParsingService {
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
if(layoutParsingType.equals(LayoutParsingType.DOCUMINE)){
|
||||
stripper.setSortByPosition(true);
|
||||
}
|
||||
stripper.getText(pdDocument);
|
||||
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
|
||||
@ -5,6 +5,9 @@ import static java.util.stream.Collectors.toSet;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@ -23,6 +26,8 @@ public class DocuMineBlockificationService {
|
||||
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z ()-]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
|
||||
|
||||
/**
|
||||
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
||||
@ -39,11 +44,7 @@ public class DocuMineBlockificationService {
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
|
||||
|
||||
float minX = 1000;
|
||||
float maxX = 0;
|
||||
float minY = 1000;
|
||||
float maxY = 0;
|
||||
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
|
||||
boolean wasSplitted = false;
|
||||
@ -60,7 +61,10 @@ public class DocuMineBlockificationService {
|
||||
boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle()
|
||||
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap)) {
|
||||
Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString());
|
||||
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) {
|
||||
|
||||
Orientation prevOrientation = null;
|
||||
if (!chunkBlockList1.isEmpty()) {
|
||||
@ -231,3 +235,4 @@ public class DocuMineBlockificationService {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -20,15 +20,14 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class DocuMineClassificationService {
|
||||
|
||||
private final BodyTextFrameService bodyTextFrameService;
|
||||
private static final Pattern pattern = Pattern.compile("^(\\d{1,1}\\.?){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z\\[\\]\\-]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern2 = Pattern.compile(".*\\d{4}$", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user