Compare commits
14 Commits
main
...
release/0.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b1a054267b | ||
|
|
43dec8744a | ||
|
|
e2a5b85c4a | ||
|
|
d5a4dd4d42 | ||
|
|
acd6d7f164 | ||
|
|
71025f7f16 | ||
|
|
ae6bad830e | ||
|
|
e030ec9dd2 | ||
|
|
49139ee603 | ||
|
|
07da43f2d9 | ||
|
|
df0bbc92c7 | ||
|
|
0497d764ec | ||
|
|
1362e4fbb2 | ||
|
|
665ad40b0b |
@ -16,6 +16,8 @@ deploy:
|
|||||||
reports:
|
reports:
|
||||||
dotenv: version.env
|
dotenv: version.env
|
||||||
rules:
|
rules:
|
||||||
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
- if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
|
||||||
- if: $CI_COMMIT_BRANCH =~ /^release/
|
- if: $CI_COMMIT_BRANCH =~ /^release/
|
||||||
- if: $CI_COMMIT_TAG
|
- if: $CI_COMMIT_TAG
|
||||||
|
pmd:
|
||||||
|
allow_failure: true
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
plugins {
|
plugins {
|
||||||
id("com.knecon.fforesight.java-conventions")
|
id("com.knecon.fforesight.java-conventions")
|
||||||
id("io.freefair.lombok") version "8.2.2"
|
id("io.freefair.lombok") version "8.6"
|
||||||
}
|
}
|
||||||
|
|
||||||
description = "layoutparser-service-internal-api"
|
description = "layoutparser-service-internal-api"
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
plugins {
|
plugins {
|
||||||
id("com.knecon.fforesight.java-conventions")
|
id("com.knecon.fforesight.java-conventions")
|
||||||
id("io.freefair.lombok") version "8.2.2"
|
id("io.freefair.lombok") version "8.6"
|
||||||
}
|
}
|
||||||
|
|
||||||
description = "layoutparser-service-processor"
|
description = "layoutparser-service-processor"
|
||||||
|
|||||||
@ -83,13 +83,17 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
try (PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId())) {
|
try (PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId())) {
|
||||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
if (layoutParsingRequest.imagesFileStorageId()
|
||||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
.isPresent()) {
|
||||||
|
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
|
||||||
|
.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
if (layoutParsingRequest.tablesFileStorageId()
|
||||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
.isPresent()) {
|
||||||
|
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
|
||||||
|
.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
|
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
|
||||||
@ -115,25 +119,25 @@ public class LayoutParsingPipeline {
|
|||||||
.numberOfPages(numberOfPages)
|
.numberOfPages(numberOfPages)
|
||||||
.duration(System.currentTimeMillis() - start)
|
.duration(System.currentTimeMillis() - start)
|
||||||
.message(format("""
|
.message(format("""
|
||||||
Layout parsing has finished in %.02f s.
|
Layout parsing has finished in %.02f s.
|
||||||
identifiers: %s
|
identifiers: %s
|
||||||
%s
|
%s
|
||||||
Files have been saved with Ids:
|
Files have been saved with Ids:
|
||||||
Structure: %s
|
Structure: %s
|
||||||
Text: %s
|
Text: %s
|
||||||
Positions: %s
|
Positions: %s
|
||||||
PageData: %s
|
PageData: %s
|
||||||
Simplified Text: %s
|
Simplified Text: %s
|
||||||
Viewer Doc: %s""",
|
Viewer Doc: %s""",
|
||||||
((float) (System.currentTimeMillis() - start)) / 1000,
|
((float) (System.currentTimeMillis() - start)) / 1000,
|
||||||
layoutParsingRequest.identifier(),
|
layoutParsingRequest.identifier(),
|
||||||
buildSemanticNodeCountMessage(numberOfPages, documentGraph.buildSemanticNodeCounts()),
|
buildSemanticNodeCountMessage(numberOfPages, documentGraph.buildSemanticNodeCounts()),
|
||||||
layoutParsingRequest.structureFileStorageId(),
|
layoutParsingRequest.structureFileStorageId(),
|
||||||
layoutParsingRequest.textBlockFileStorageId(),
|
layoutParsingRequest.textBlockFileStorageId(),
|
||||||
layoutParsingRequest.positionBlockFileStorageId(),
|
layoutParsingRequest.positionBlockFileStorageId(),
|
||||||
layoutParsingRequest.pageFileStorageId(),
|
layoutParsingRequest.pageFileStorageId(),
|
||||||
layoutParsingRequest.simplifiedTextStorageId(),
|
layoutParsingRequest.simplifiedTextStorageId(),
|
||||||
layoutParsingRequest.viewerDocumentStorageId()))
|
layoutParsingRequest.viewerDocumentStorageId()))
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -142,14 +146,14 @@ public class LayoutParsingPipeline {
|
|||||||
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||||
|
|
||||||
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||||
numberOfPages,
|
numberOfPages,
|
||||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -213,7 +217,7 @@ public class LayoutParsingPipeline {
|
|||||||
tableExtractionService.extractTables(cleanRulings, classificationPage);
|
tableExtractionService.extractTables(cleanRulings, classificationPage);
|
||||||
|
|
||||||
buildPageStatistics(classificationPage);
|
buildPageStatistics(classificationPage);
|
||||||
increaseDocumentStatistics(classificationPage, classificationDocument);
|
increaseDocumentStatistics(layoutParsingType, classificationPage, classificationDocument);
|
||||||
|
|
||||||
classificationPages.add(classificationPage);
|
classificationPages.add(classificationPage);
|
||||||
}
|
}
|
||||||
@ -242,11 +246,11 @@ public class LayoutParsingPipeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
private void increaseDocumentStatistics(LayoutParsingType layoutParsingType, ClassificationPage classificationPage, ClassificationDocument document) {
|
||||||
|
|
||||||
if (!classificationPage.isLandscape()) {
|
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) || !classificationPage.isLandscape()) {
|
||||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||||
}
|
}
|
||||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||||
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
||||||
|
|||||||
@ -196,6 +196,12 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public float getTextHeightNoPadding() {
|
||||||
|
|
||||||
|
return textPositions.get(0).getHeightDir();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
@JsonAttribute(ignore = true)
|
@JsonAttribute(ignore = true)
|
||||||
public float getTextHeight() {
|
public float getTextHeight() {
|
||||||
@ -234,6 +240,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
@JsonAttribute(ignore = true)
|
@JsonAttribute(ignore = true)
|
||||||
public String getFontStyle() {
|
public String getFontStyle() {
|
||||||
|
|
||||||
if (textPositions.get(0).getFontName() == null) {
|
if (textPositions.get(0).getFontName() == null) {
|
||||||
return "standard";
|
return "standard";
|
||||||
}
|
}
|
||||||
|
|||||||
@ -61,8 +61,8 @@ public class DocuMineBlockificationService {
|
|||||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||||
boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle()
|
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle()
|
||||||
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
.contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||||
|
|
||||||
Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString());
|
Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString());
|
||||||
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
|
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
|
||||||
|
|||||||
@ -5,7 +5,6 @@ import java.util.Locale;
|
|||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
@ -13,6 +12,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.HeaderFooterDetection;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -23,7 +24,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class DocuMineClassificationService {
|
public class DocuMineClassificationService {
|
||||||
|
|
||||||
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||||
|
|
||||||
@ -64,46 +65,64 @@ public class DocuMineClassificationService {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
|| (PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
textBlock,
|
||||||
) {
|
page.getRotation()) && (document.getFontSizeCounter().getMostPopular()
|
||||||
|
== null
|
||||||
|
|| textBlock.getHighestFontSize()
|
||||||
|
<= document.getFontSizeCounter()
|
||||||
|
.getMostPopular()))) {
|
||||||
textBlock.setClassification(PageBlockType.HEADER);
|
textBlock.setClassification(PageBlockType.HEADER);
|
||||||
|
|
||||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
|| (PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
textBlock,
|
||||||
) {
|
page.getRotation())
|
||||||
|
&& (document.getFontSizeCounter().getMostPopular()
|
||||||
|
== null
|
||||||
|
|| textBlock.getHighestFontSize()
|
||||||
|
<= document.getFontSizeCounter()
|
||||||
|
.getMostPopular()))
|
||||||
|
|| HeaderFooterDetection.isLikelyFooter(textBlock, document, page)) {
|
||||||
textBlock.setClassification(PageBlockType.FOOTER);
|
textBlock.setClassification(PageBlockType.FOOTER);
|
||||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||||
.size() == 1)) {
|
|
||||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||||
textBlock.setClassification(PageBlockType.TITLE);
|
textBlock.setClassification(PageBlockType.TITLE);
|
||||||
}
|
}
|
||||||
} else if (textBlock.getText().length() > 5 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
|
} else if (textBlock.getText().length() > 5
|
||||||
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
|
&& (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
|
||||||
|
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
|
||||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||||
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
|
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString()
|
||||||
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
|
.contains(":")
|
||||||
.startsWith("TABLE")) && !textBlock.toString().endsWith(":") && matcher2.find()) {
|
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":")
|
||||||
|
|| textBlock.toString().startsWith("APPENDIX")
|
||||||
|
|| textBlock.toString().startsWith("FIGURE")
|
||||||
|
|| textBlock.toString().startsWith("TABLE"))
|
||||||
|
&& !textBlock.toString().endsWith(":")
|
||||||
|
&& matcher2.reset().find()) {
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
|
|
||||||
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
|
} else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) {
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||||
|
&& textBlock.getMostPopularWordStyle().equals("bold")
|
||||||
|
&& !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
|
&& textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
|
||||||
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
&& textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
|
||||||
|
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
|
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||||
.getMostPopular()
|
&& textBlock.getMostPopularWordStyle().equals("italic")
|
||||||
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
&& !document.getFontStyleCounter().getMostPopular().equals("italic")
|
||||||
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
||||||
|
|||||||
@ -4,11 +4,9 @@ import java.awt.geom.AffineTransform;
|
|||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.pdfbox.cos.COSDictionary;
|
|
||||||
import org.apache.pdfbox.cos.COSName;
|
import org.apache.pdfbox.cos.COSName;
|
||||||
|
import org.apache.pdfbox.pdfwriter.compress.CompressParameters;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
|
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
@ -40,7 +38,6 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class ViewerDocumentService {
|
public class ViewerDocumentService {
|
||||||
|
|
||||||
|
|
||||||
private static final String LAYER_NAME = "Layout grid";
|
private static final String LAYER_NAME = "Layout grid";
|
||||||
private static final int FONT_SIZE = 10;
|
private static final int FONT_SIZE = 10;
|
||||||
public static final float LINE_WIDTH = 1f;
|
public static final float LINE_WIDTH = 1f;
|
||||||
@ -54,8 +51,7 @@ public class ViewerDocumentService {
|
|||||||
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
|
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
|
||||||
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
|
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
|
||||||
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
|
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
|
||||||
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
|
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, layerVisibilityDefaultValue);
|
||||||
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue);
|
|
||||||
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
|
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
|
||||||
|
|
||||||
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
|
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
|
||||||
@ -68,7 +64,8 @@ public class ViewerDocumentService {
|
|||||||
// e.g. not escaped matrix transformations.
|
// e.g. not escaped matrix transformations.
|
||||||
escapePreviousContents(pdDocument, pdPage);
|
escapePreviousContents(pdDocument, pdPage);
|
||||||
|
|
||||||
VisualizationsOnPage visualizationsOnPage = layoutGrid.getVisualizationsPerPages().get(pageNumber);
|
VisualizationsOnPage visualizationsOnPage = layoutGrid.getVisualizationsPerPages()
|
||||||
|
.get(pageNumber);
|
||||||
assert pageNumber == visualizationsOnPage.getPageNumber();
|
assert pageNumber == visualizationsOnPage.getPageNumber();
|
||||||
// We need to append to the content stream, otherwise the content could be overlapped by following content.
|
// We need to append to the content stream, otherwise the content could be overlapped by following content.
|
||||||
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
|
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
|
||||||
@ -102,11 +99,11 @@ public class ViewerDocumentService {
|
|||||||
contentStream.setFont(font, FONT_SIZE);
|
contentStream.setFont(font, FONT_SIZE);
|
||||||
contentStream.beginText();
|
contentStream.beginText();
|
||||||
Matrix textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
|
Matrix textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
|
||||||
(float) textDeRotationMatrix.getShearX(),
|
(float) textDeRotationMatrix.getShearX(),
|
||||||
(float) textDeRotationMatrix.getShearY(),
|
(float) textDeRotationMatrix.getShearY(),
|
||||||
(float) textDeRotationMatrix.getScaleY(),
|
(float) textDeRotationMatrix.getScaleY(),
|
||||||
(float) placedText.lineStart().getX(),
|
(float) placedText.lineStart().getX(),
|
||||||
(float) placedText.lineStart().getY());
|
(float) placedText.lineStart().getY());
|
||||||
textMatrix.translate(-((font.getStringWidth(placedText.text()) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), -FONT_SIZE);
|
textMatrix.translate(-((font.getStringWidth(placedText.text()) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), -FONT_SIZE);
|
||||||
contentStream.setTextMatrix(textMatrix);
|
contentStream.setTextMatrix(textMatrix);
|
||||||
contentStream.showText(placedText.text());
|
contentStream.showText(placedText.text());
|
||||||
@ -115,12 +112,9 @@ public class ViewerDocumentService {
|
|||||||
contentStream.restoreGraphicsState();
|
contentStream.restoreGraphicsState();
|
||||||
contentStream.endMarkedContent();
|
contentStream.endMarkedContent();
|
||||||
}
|
}
|
||||||
dictionariesToUpdate.add(pdPage.getCOSObject());
|
|
||||||
dictionariesToUpdate.add(pdPage.getResources().getCOSObject());
|
|
||||||
}
|
}
|
||||||
dictionariesToUpdate.add(pdDocument.getDocumentInformation().getCOSObject());
|
|
||||||
// dictionariesToUpdate.add(pdDocument.getDocument().getTrailer());
|
// dictionariesToUpdate.add(pdDocument.getDocument().getTrailer());
|
||||||
pdDocument.saveIncremental(outputStream, dictionariesToUpdate);
|
pdDocument.save(outputStream, CompressParameters.NO_COMPRESSION);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -145,7 +139,7 @@ public class ViewerDocumentService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, Set<COSDictionary> dictionariesToUpdate, boolean layerVisibilityDefaultValue) {
|
private static PDOptionalContentGroup addLayerToDocument(PDDocument pdDocument, boolean layerVisibilityDefaultValue) {
|
||||||
|
|
||||||
PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
|
PDDocumentCatalog catalog = pdDocument.getDocumentCatalog();
|
||||||
PDOptionalContentProperties ocprops = catalog.getOCProperties();
|
PDOptionalContentProperties ocprops = catalog.getOCProperties();
|
||||||
@ -161,7 +155,6 @@ public class ViewerDocumentService {
|
|||||||
ocprops.addGroup(layer);
|
ocprops.addGroup(layer);
|
||||||
}
|
}
|
||||||
ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue);
|
ocprops.setGroupEnabled(layer, layerVisibilityDefaultValue);
|
||||||
dictionariesToUpdate.add(catalog.getCOSObject());
|
|
||||||
return layer;
|
return layer;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,223 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
public class HeaderFooterDetection {
|
||||||
|
|
||||||
|
private final Map<Integer, ClassificationPage> pagesCache = new HashMap<>();
|
||||||
|
private static final double THRESHOLD = 0.5;
|
||||||
|
// Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page.
|
||||||
|
private static final double[] headerWeights = {1.0, 0.75, 0.5};
|
||||||
|
// Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page.
|
||||||
|
private static final double[] footerWeights = {0.5, 0.75, 1.0};
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
|
||||||
|
|
||||||
|
int numberOfPages = document.getPages().size();
|
||||||
|
if (numberOfPages < 3) {
|
||||||
|
// If the document has 1 or 2 pages this may lead to more false positives than actual findings.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
int window = Math.min(numberOfPages, 8);
|
||||||
|
|
||||||
|
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
|
||||||
|
List<List<AbstractPageBlock>> footerCandidates = getFooterCandidates(nearestPages);
|
||||||
|
|
||||||
|
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
|
||||||
|
|
||||||
|
int numberOfPages = document.getPages().size();
|
||||||
|
if (numberOfPages < 3) {
|
||||||
|
// If the document has 1 or 2 pages this may lead to more false positives than actual findings.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
int window = Math.min(numberOfPages, 8);
|
||||||
|
|
||||||
|
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
|
||||||
|
List<List<AbstractPageBlock>> headerCandidates = getHeaderCandidates(nearestPages);
|
||||||
|
|
||||||
|
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean detectHeadersOrFootersByPageAssociation(String testString, List<List<AbstractPageBlock>> candidates, int window, double[] weights) {
|
||||||
|
|
||||||
|
double highestScore = 0.0;
|
||||||
|
|
||||||
|
for (int i = 0; i < candidates.size(); i++) {
|
||||||
|
List<List<String>> candidateStrings = new ArrayList<>();
|
||||||
|
for (int k = Math.max(i - window, 0); k < Math.min(i + window, candidates.size()); k++) {
|
||||||
|
candidateStrings.add(candidates.get(k)
|
||||||
|
.stream()
|
||||||
|
.map(AbstractPageBlock::getText)
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
|
||||||
|
int maxLen = candidateStrings.stream()
|
||||||
|
.mapToInt(List::size)
|
||||||
|
.max()
|
||||||
|
.orElse(0);
|
||||||
|
for (List<String> sublist : candidateStrings) {
|
||||||
|
while (sublist.size() < maxLen) {
|
||||||
|
sublist.add(0, "");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compare the testString against each candidate in the window
|
||||||
|
for (int j = 0; j < maxLen; j++) {
|
||||||
|
double score = 0.0;
|
||||||
|
int finalJ = j;
|
||||||
|
List<String> paddedCandidateStrings = candidateStrings.stream()
|
||||||
|
.map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "")
|
||||||
|
.toList();
|
||||||
|
for (String paddedString : paddedCandidateStrings) {
|
||||||
|
if ((testString.length() >= 5 && paddedString.length() >= 5) && (testString.length() > 2 * paddedString.length()
|
||||||
|
|| paddedString.length() > 2 * testString.length())) {
|
||||||
|
// If both strings are at least 5 characters long and one string is more than twice as long as the other,
|
||||||
|
// skip the distance calculation as it's time-consuming, and we can assume they are not similar enough
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
int distance = calculateHammingDistanceWithPreprocessing(testString, paddedString);
|
||||||
|
double normalizedScore = 1 - (double) distance / Math.max(testString.length(), paddedString.length());
|
||||||
|
score += normalizedScore * (j < weights.length ? weights[j] : 1);
|
||||||
|
}
|
||||||
|
score /= paddedCandidateStrings.size();
|
||||||
|
highestScore = Math.max(highestScore, score);
|
||||||
|
// Early stop
|
||||||
|
if (highestScore > THRESHOLD) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find the nearest n pages for a given page.
|
||||||
|
* For example: nearest 8 pages for page 4 are: 1, 2, 3, 5, 6, 7, 8, 9.
|
||||||
|
*
|
||||||
|
* @param currentPage Current page to find the nearest ones.
|
||||||
|
* @param allPages All pages in the document.
|
||||||
|
* @param numNeighbors Number of neighbouring pages to find.
|
||||||
|
* @return The nearest pages.
|
||||||
|
*/
|
||||||
|
private List<ClassificationPage> findNearestPages(ClassificationPage currentPage, List<ClassificationPage> allPages, int numNeighbors) {
|
||||||
|
|
||||||
|
int totalPages = allPages.size();
|
||||||
|
List<ClassificationPage> nearestPages = new ArrayList<>();
|
||||||
|
|
||||||
|
int currentPageIndex = currentPage.getPageNumber() - 1;
|
||||||
|
int halfWin = numNeighbors / 2;
|
||||||
|
int start = Math.max(0, currentPageIndex - halfWin);
|
||||||
|
int end = Math.min(totalPages - 1, currentPageIndex + halfWin);
|
||||||
|
|
||||||
|
for (int i = start; i <= end; i++) {
|
||||||
|
if (i != currentPageIndex) {
|
||||||
|
nearestPages.add(pagesCache.computeIfAbsent(i, allPages::get));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pagesCache.keySet().removeIf(key -> key < start || key > end);
|
||||||
|
|
||||||
|
return nearestPages;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Get the last 3 TextBlocks on the page as they are likely to be a footer
|
||||||
|
private List<List<AbstractPageBlock>> getFooterCandidates(List<ClassificationPage> pages) {
|
||||||
|
|
||||||
|
List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>();
|
||||||
|
for (ClassificationPage page : pages) {
|
||||||
|
List<AbstractPageBlock> textBlocks = page.getTextBlocks();
|
||||||
|
List<TextPageBlock> textPageBlocks = textBlocks.stream()
|
||||||
|
.filter(textBlock -> textBlock instanceof TextPageBlock)
|
||||||
|
.map(textBlock -> (TextPageBlock) textBlock)
|
||||||
|
.toList();
|
||||||
|
int blockCount = textPageBlocks.size();
|
||||||
|
if (blockCount > 0) {
|
||||||
|
int start = Math.max(0, blockCount - 3);
|
||||||
|
footerCandidates.add(new ArrayList<>(textPageBlocks.subList(start, blockCount)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return footerCandidates;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Get the first 3 TextBlocks on the page as they are likely to be a header
|
||||||
|
private List<List<AbstractPageBlock>> getHeaderCandidates(List<ClassificationPage> pages) {
|
||||||
|
|
||||||
|
List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>();
|
||||||
|
for (ClassificationPage page : pages) {
|
||||||
|
List<AbstractPageBlock> textBlocks = page.getTextBlocks();
|
||||||
|
List<TextPageBlock> textPageBlocks = textBlocks.stream()
|
||||||
|
.filter(textBlock -> textBlock instanceof TextPageBlock)
|
||||||
|
.map(textBlock -> (TextPageBlock) textBlock)
|
||||||
|
.toList();
|
||||||
|
int count = Math.min(3, textPageBlocks.size());
|
||||||
|
headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count)));
|
||||||
|
}
|
||||||
|
return headerCandidates;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculate the Hamming distance between two strings after preprocessing to make them the same length
|
||||||
|
* and replacing all digits with a special character '@' since they are a common occurrence in headers/footers.
|
||||||
|
*
|
||||||
|
* @param firstCandidate First string
|
||||||
|
* @param secondCandidate Second string
|
||||||
|
* @return The Hamming distance between the two preprocessed strings.
|
||||||
|
*/
|
||||||
|
private int calculateHammingDistanceWithPreprocessing(String firstCandidate, String secondCandidate) {
|
||||||
|
|
||||||
|
int maxLength = Math.max(firstCandidate.length(), secondCandidate.length());
|
||||||
|
|
||||||
|
String cleanFirstCandidate = padString(firstCandidate, maxLength, '\0').replaceAll("\\d", "@");
|
||||||
|
String cleanSecondCandidate = padString(secondCandidate, maxLength, '\0').replaceAll("\\d", "@");
|
||||||
|
|
||||||
|
int distance = 0;
|
||||||
|
for (int i = 0; i < maxLength; i++) {
|
||||||
|
if (cleanFirstCandidate.charAt(i) != cleanSecondCandidate.charAt(i)) {
|
||||||
|
distance++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return distance;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private String padString(String input, int length, char padChar) {
|
||||||
|
|
||||||
|
if (input.length() >= length) {
|
||||||
|
return input;
|
||||||
|
}
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder(input);
|
||||||
|
|
||||||
|
while (sb.length() < length) {
|
||||||
|
sb.append(padChar);
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -28,15 +28,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPosit
|
|||||||
*
|
*
|
||||||
* @author Ben Litchfield
|
* @author Ben Litchfield
|
||||||
*/
|
*/
|
||||||
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence>
|
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> {
|
||||||
{
|
|
||||||
@Override
|
@Override
|
||||||
public int compare(TextPositionSequence pos1, TextPositionSequence pos2)
|
public int compare(TextPositionSequence pos1, TextPositionSequence pos2) {
|
||||||
{
|
|
||||||
// only compare text that is in the same direction
|
// only compare text that is in the same direction
|
||||||
int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
|
int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
|
||||||
if (cmp1 != 0)
|
if (cmp1 != 0) {
|
||||||
{
|
|
||||||
return cmp1;
|
return cmp1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -48,25 +46,19 @@ public class TextPositionSequenceComparator implements Comparator<TextPositionSe
|
|||||||
float pos2YBottom = pos2.getMaxYDirAdj();
|
float pos2YBottom = pos2.getMaxYDirAdj();
|
||||||
|
|
||||||
// note that the coordinates have been adjusted so 0,0 is in upper left
|
// note that the coordinates have been adjusted so 0,0 is in upper left
|
||||||
float pos1YTop = pos1YBottom - pos1.getTextHeight();
|
float pos1YTop = pos1YBottom - pos1.getTextHeightNoPadding();
|
||||||
float pos2YTop = pos2YBottom - pos2.getTextHeight();
|
float pos2YTop = pos2YBottom - pos2.getTextHeightNoPadding();
|
||||||
|
|
||||||
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
||||||
|
|
||||||
// we will do a simple tolerance comparison
|
// we will do a simple tolerance comparison
|
||||||
if (yDifference < .1 ||
|
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
|
||||||
pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom ||
|
|
||||||
pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)
|
|
||||||
{
|
|
||||||
return Float.compare(x1, x2);
|
return Float.compare(x1, x2);
|
||||||
}
|
} else if (pos1YBottom < pos2YBottom) {
|
||||||
else if (pos1YBottom < pos2YBottom)
|
|
||||||
{
|
|
||||||
return -1;
|
return -1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -6,7 +6,7 @@ plugins {
|
|||||||
id("org.springframework.boot") version "3.1.3"
|
id("org.springframework.boot") version "3.1.3"
|
||||||
id("io.spring.dependency-management") version "1.1.3"
|
id("io.spring.dependency-management") version "1.1.3"
|
||||||
id("org.sonarqube") version "4.3.0.3225"
|
id("org.sonarqube") version "4.3.0.3225"
|
||||||
id("io.freefair.lombok") version "8.2.2"
|
id("io.freefair.lombok") version "8.6"
|
||||||
// id("org.graalvm.buildtools.native") version "0.9.23"
|
// id("org.graalvm.buildtools.native") version "0.9.23"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user