RED-7461: Fixed PR findings

This commit is contained in:
deiflaender 2023-08-21 16:57:37 +02:00
parent b270b9c942
commit 0cb8029f0a
9 changed files with 36 additions and 22 deletions

View File

@ -10,12 +10,11 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentConverter;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.PDFMarkedContentExtractor;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
@ -167,9 +166,12 @@ public class LayoutParsingPipeline {
stripper.getMaxCharHeight());
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case REDACT_MANAGER ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case TAAS ->
taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE ->
docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
};
classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation);
@ -209,10 +211,10 @@ public class LayoutParsingPipeline {
}
/**
 * Builds a map from marked-content type name to the bounding box computed for that type.
 * One entry is stored for the header type and one for the footer type; each value is whatever
 * {@code getMarkedContentBboxPerLine} returns for the given marked contents and type name.
 *
 * NOTE(review): this listing is a diff rendered without +/- markers. The first signature line and
 * the two put(...) calls using string literals / MarkedContentConverter appear to be the
 * pre-change lines; the lines using the MarkedContentUtils constants appear to replace them.
 * Confirm against the actual file before relying on this block as compilable source.
 *
 * @param pdMarkedContents marked contents handed to the bounding-box helper
 * @return a mutable HashMap keyed by marked-content type name
 */
private Map<String, Rectangle2D> convertMarkedContents(List<PDMarkedContent> pdMarkedContents){
private Map<String, Rectangle2D> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
Map<String, Rectangle2D> markedContentBboxes = new HashMap<>();
// Presumably the pre-change form: type names spelled as string literals.
markedContentBboxes.put("Header", MarkedContentConverter.getMarkedContentBboxPerLine(pdMarkedContents, "Header"));
markedContentBboxes.put("Footer", MarkedContentConverter.getMarkedContentBboxPerLine(pdMarkedContents, "Footer"));
// Presumably the post-change form: the same keys taken from the shared constants.
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER));
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER));
return markedContentBboxes;
}

View File

@ -73,6 +73,7 @@ public class TextPageBlock extends AbstractPageBlock {
return sequences.get(0).getPageWidth();
}
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
@ -156,8 +157,8 @@ public class BodyTextFrameService {
continue;
}
if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
|| page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())) {
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)) {
continue;
}

View File

@ -5,6 +5,7 @@ import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
@ -62,13 +63,13 @@ public class DocuMineClassificationService {
textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
) {

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
import java.util.List;
import java.util.regex.Pattern;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
@ -51,12 +52,12 @@ public class RedactManagerClassificationService {
textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
textBlock.setClassification(PageBlockType.FOOTER);

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
import java.util.List;
import java.util.regex.Pattern;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
@ -56,10 +57,10 @@ public class TaasClassificationService {
textBlock.setClassification(PageBlockType.OTHER);
return;
}
if (page.getMarkedContentBboxPerType().get("Header") != null && page.getMarkedContentBboxPerType().get("Header").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
textBlock.setClassification(PageBlockType.HEADER);
} else if (page.getMarkedContentBboxPerType().get("Footer") != null && page.getMarkedContentBboxPerType().get("Footer").intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight())
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,

View File

@ -10,7 +10,6 @@ import lombok.extern.slf4j.Slf4j;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.pdfbox.contentstream.operator.color.*;
import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequence;
import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequenceWithProperties;
import org.apache.pdfbox.contentstream.operator.markedcontent.EndMarkedContentSequence;
import org.apache.pdfbox.contentstream.operator.state.*;
@ -76,14 +75,12 @@ public class PDFLinesTextStripper extends PDFTextStripper {
addOperator(new BeginMarkedContentSequenceWithProperties());
addOperator(new BeginMarkedContentSequence());
// addOperator(new BeginMarkedContentSequence());
addOperator(new EndMarkedContentSequence());
}
@Override
protected void processOperator(Operator operator, List<COSBase> arguments) throws IOException {

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
import org.apache.pdfbox.cos.COSName;
@ -9,11 +10,14 @@ import org.apache.pdfbox.text.TextPosition;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@UtilityClass
public class MarkedContentConverter {
public class MarkedContentUtils {
public static final String HEADER = "Header";
public static final String FOOTER = "Footer";
public Rectangle2D getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
@ -42,4 +46,10 @@ public class MarkedContentConverter {
.getRectangle())
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList()));
}
/**
 * Tells whether a text block overlaps the bounding box stored for a marked-content type.
 *
 * @param textBlock the block whose rectangle (getPdfMinX, getPdfMinY, getWidth, getHeight) is tested
 * @param markedContentBboxPerType bounding boxes keyed by marked-content type name
 * @param type the key to look up in {@code markedContentBboxPerType}
 * @return {@code false} when the map holds no bounding box for {@code type}; otherwise the result
 *         of {@link Rectangle2D#intersects(double, double, double, double)} for the block's rectangle
 */
public boolean intersects(TextPageBlock textBlock, Map<String, Rectangle2D> markedContentBboxPerType, String type) {
    Rectangle2D typeBbox = markedContentBboxPerType.get(type);
    if (typeBbox == null) {
        // Nothing registered for this type, so there is nothing to overlap with.
        return false;
    }
    return typeBbox.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight());
}
}

View File

@ -40,7 +40,7 @@ public class BuildDocumentGraphTest extends AbstractTest {
/**
 * Convenience overload that delegates to the two-argument {@code buildGraph} with a fixed
 * {@code LayoutParsingType}.
 *
 * NOTE(review): this listing is a diff rendered without +/- markers. The two return lines appear
 * to be the before (DOCUMINE) and after (REDACT_MANAGER) versions of a single statement — confirm
 * against the actual file.
 *
 * @param filename passed through unchanged to the two-argument overload
 * @return whatever the two-argument {@code buildGraph} returns
 */
@SneakyThrows
protected Document buildGraph(String filename) {
// Presumably the pre-change default.
return buildGraph(filename, LayoutParsingType.DOCUMINE);
// Presumably the post-change default.
return buildGraph(filename, LayoutParsingType.REDACT_MANAGER);
}