RED-5253: Ported last documine changes
This commit is contained in:
parent
676f0c9d09
commit
150aea55c0
@ -77,6 +77,9 @@ public class PdfParsingService {
|
|||||||
stripper.setStartPage(pageNumber);
|
stripper.setStartPage(pageNumber);
|
||||||
stripper.setEndPage(pageNumber);
|
stripper.setEndPage(pageNumber);
|
||||||
stripper.setPdpage(pdPage);
|
stripper.setPdpage(pdPage);
|
||||||
|
if(layoutParsingType.equals(LayoutParsingType.DOCUMINE)){
|
||||||
|
stripper.setSortByPosition(true);
|
||||||
|
}
|
||||||
stripper.getText(pdDocument);
|
stripper.getText(pdDocument);
|
||||||
|
|
||||||
PDRectangle pdr = pdPage.getMediaBox();
|
PDRectangle pdr = pdPage.getMediaBox();
|
||||||
|
|||||||
@ -5,6 +5,9 @@ import static java.util.stream.Collectors.toSet;
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
@ -23,6 +26,8 @@ public class DocuMineBlockificationService {
|
|||||||
|
|
||||||
static final float THRESHOLD = 1f;
|
static final float THRESHOLD = 1f;
|
||||||
|
|
||||||
|
Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z ()-]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
||||||
@ -39,11 +44,7 @@ public class DocuMineBlockificationService {
|
|||||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||||
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
|
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
|
||||||
|
|
||||||
float minX = 1000;
|
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||||
float maxX = 0;
|
|
||||||
float minY = 1000;
|
|
||||||
float maxY = 0;
|
|
||||||
|
|
||||||
TextPositionSequence prev = null;
|
TextPositionSequence prev = null;
|
||||||
|
|
||||||
boolean wasSplitted = false;
|
boolean wasSplitted = false;
|
||||||
@ -60,7 +61,10 @@ public class DocuMineBlockificationService {
|
|||||||
boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle()
|
boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle()
|
||||||
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||||
|
|
||||||
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap)) {
|
Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString());
|
||||||
|
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
|
||||||
|
|
||||||
|
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) {
|
||||||
|
|
||||||
Orientation prevOrientation = null;
|
Orientation prevOrientation = null;
|
||||||
if (!chunkBlockList1.isEmpty()) {
|
if (!chunkBlockList1.isEmpty()) {
|
||||||
@ -231,3 +235,4 @@ public class DocuMineBlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -20,15 +20,14 @@ import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
|||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@Service
|
@Service
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class DocuMineClassificationService {
|
public class DocuMineClassificationService {
|
||||||
|
|
||||||
private final BodyTextFrameService bodyTextFrameService;
|
private final BodyTextFrameService bodyTextFrameService;
|
||||||
private static final Pattern pattern = Pattern.compile("^(\\d{1,1}\\.?){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z\\[\\]\\-]{2,50}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern pattern2 = Pattern.compile(".*\\d{4}$", Pattern.CASE_INSENSITIVE);
|
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user