RED-7461: Fixed PR findings
This commit is contained in:
parent
b270b9c942
commit
0cb8029f0a
@ -10,12 +10,11 @@ import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentConverter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
import org.apache.pdfbox.text.PDFMarkedContentExtractor;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
@ -167,9 +166,12 @@ public class LayoutParsingPipeline {
|
||||
stripper.getMaxCharHeight());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case REDACT_MANAGER ->
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case TAAS ->
|
||||
taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE ->
|
||||
docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
};
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
classificationPage.setRotation(rotation);
|
||||
@ -209,10 +211,10 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
|
||||
|
||||
private Map<String, Rectangle2D> convertMarkedContents(List<PDMarkedContent> pdMarkedContents){
|
||||
private Map<String, Rectangle2D> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
|
||||
Map<String, Rectangle2D> markedContentBboxes = new HashMap<>();
|
||||
markedContentBboxes.put("Header", MarkedContentConverter.getMarkedContentBboxPerLine(pdMarkedContents, "Header"));
|
||||
markedContentBboxes.put("Footer", MarkedContentConverter.getMarkedContentBboxPerLine(pdMarkedContents, "Footer"));
|
||||
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER));
|
||||
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER));
|
||||
return markedContentBboxes;
|
||||
}
|
||||
|
||||
|
||||
@ -73,6 +73,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
return sequences.get(0).getPageWidth();
|
||||
}
|
||||
|
||||
|
||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||
|
||||
|
||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
@ -156,8 +157,8 @@ public class BodyTextFrameService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
|
||||
|| page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())) {
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
@ -5,6 +5,7 @@ import java.util.Locale;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
@ -62,13 +63,13 @@ public class DocuMineClassificationService {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
return;
|
||||
}
|
||||
if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
||||
) {
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
} else if (page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
||||
) {
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
@ -51,12 +52,12 @@ public class RedactManagerClassificationService {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
return;
|
||||
}
|
||||
if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
} else if (page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
@ -56,10 +57,10 @@ public class TaasClassificationService {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
return;
|
||||
}
|
||||
if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
} else if (page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||
|
||||
@ -10,7 +10,6 @@ import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
||||
import org.apache.pdfbox.contentstream.operator.color.*;
|
||||
import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequence;
|
||||
import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequenceWithProperties;
|
||||
import org.apache.pdfbox.contentstream.operator.markedcontent.EndMarkedContentSequence;
|
||||
import org.apache.pdfbox.contentstream.operator.state.*;
|
||||
@ -76,14 +75,12 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
|
||||
|
||||
addOperator(new BeginMarkedContentSequenceWithProperties());
|
||||
addOperator(new BeginMarkedContentSequence());
|
||||
// addOperator(new BeginMarkedContentSequence());
|
||||
addOperator(new EndMarkedContentSequence());
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
protected void processOperator(Operator operator, List<COSBase> arguments) throws IOException {
|
||||
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import lombok.experimental.UtilityClass;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
@ -9,11 +10,14 @@ import org.apache.pdfbox.text.TextPosition;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@UtilityClass
|
||||
public class MarkedContentConverter {
|
||||
public class MarkedContentUtils {
|
||||
|
||||
public static final String HEADER = "Header";
|
||||
public static final String FOOTER = "Footer";
|
||||
|
||||
public Rectangle2D getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
|
||||
|
||||
@ -42,4 +46,10 @@ public class MarkedContentConverter {
|
||||
.getRectangle())
|
||||
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(TextPageBlock textBlock, Map<String, Rectangle2D> markedContentBboxPerType, String type) {
|
||||
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight());
|
||||
}
|
||||
|
||||
}
|
||||
@ -40,7 +40,7 @@ public class BuildDocumentGraphTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
protected Document buildGraph(String filename) {
|
||||
|
||||
return buildGraph(filename, LayoutParsingType.DOCUMINE);
|
||||
return buildGraph(filename, LayoutParsingType.REDACT_MANAGER);
|
||||
}
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user