RED-8825: general improvements

* some more refactoring
This commit is contained in:
Kilian Schuettler 2024-04-29 20:09:53 +02:00
parent 64209255cb
commit 08be18db2d
10 changed files with 30 additions and 33 deletions

View File

@ -261,7 +261,7 @@ public class LayoutParsingPipeline {
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
@ -293,10 +293,10 @@ public class LayoutParsingPipeline {
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber, pdPage);
// MarkedContent needs to be converted at this point; otherwise it leads to GC problems in PDFBox.
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents(), pdPage));
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
// Whether an image is OCR needs to be determined before textBlocks are moved into tables; otherwise the findOcr algorithm needs to be adapted.
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
if (pdfImages.containsKey(pageNumber)) {
classificationPage.setImages(pdfImages.get(pageNumber));
imageServiceResponseAdapter.findOcr(classificationPage);
}
@ -370,11 +370,11 @@ public class LayoutParsingPipeline {
}
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents, PDPage pdPage) {
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
Map<String, List<Rectangle2D>> markedContentBboxes = new HashMap<>();
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER, pdPage));
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER, pdPage));
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER));
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER));
return markedContentBboxes;
}

View File

@ -9,6 +9,7 @@ import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
@ -54,11 +55,12 @@ public class ImageServiceResponseAdapter {
classificationPage.getImages().forEach(image -> {
if (image.getImageType().equals(ImageType.OTHER)) {
classificationPage.getTextBlocks().forEach(textblock -> {
if (image.getPosition().contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
if (image.getPosition().contains(textblock.getBBox())) {
image.setImageType(ImageType.OCR);
return;
}
});
}
}
});
}

View File

@ -31,7 +31,7 @@ public class RulingCleaningService {
private static final float THRESHOLD_Y_HORIZONTAL = 3;
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings) {
public CleanRulings deduplicateAndStraightenRulings(List<TableCells> tableCells, List<Ruling> rulings) {
Rulings verticalAndHorizontalRulingLines;

View File

@ -12,9 +12,9 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class TextRulingsClassifier {
private final static double STRIKETHROUGH_ZONE = 0.5; // multiplied with text height, determines height of intersection interval for strikethrough lines
private final static double UNDERLINE_ZONE = 0.2; // multiplied with text height, determines height of intersection interval of underline lines
private final static double TEXT_BBOX_THRESHOLD_FACTOR = 0.15; // multiplied with text width, subtracted from word width
private final static double STRIKETHROUGH_ZONE = 0.5; // multiplied with text height, determines height of intersection interval for strikethrough lines.
private final static double UNDERLINE_ZONE = 0.2; // multiplied with text height, determines height of intersection interval of underline lines.
private final static double TEXT_BBOX_THRESHOLD_FACTOR = 0.15; // multiplied with text width then subtracted from word width. If ruling covers this width, it is considered as strikethrough/underline.
public static void classifyUnderlinedAndStrikethroughText(List<TextPositionSequence> words, CleanRulings cleanRulings) {

View File

@ -48,7 +48,7 @@ public class DocstrumBlockificationService {
visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage());
}
var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontals(), usedRulings.getVerticals(), xyOrder, usedRulings);
var pageBlocks = toAbstractPageBlocks(zones, xyOrder, usedRulings);
if (xyOrder) {
sortPageBlocksXThenY(pageBlocks);
@ -77,10 +77,7 @@ public class DocstrumBlockificationService {
}
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones,
List<Ruling> horizontalRulings,
List<Ruling> verticalRulings,
boolean xyOrder,
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, boolean xyOrder,
CleanRulings usedRulings) {
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();

View File

@ -5,7 +5,6 @@ import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
@ -13,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
@ -49,7 +49,6 @@ public class DocuMineClassificationService {
}
}
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
log.debug("headlineFontSizes: {}", headlineFontSizes);

View File

@ -8,7 +8,6 @@ import java.util.Map;
import java.util.stream.Collectors;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.TextPosition;
@ -24,7 +23,7 @@ public class MarkedContentUtils {
public static final String FOOTER = "Footer";
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype, PDPage pdPage) {
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
if (markedContents == null) {
return Collections.emptyList();
@ -54,7 +53,7 @@ public class MarkedContentUtils {
}
public List<MarkedContentPosition> getMarkedContentPositions(List<PDMarkedContent> markedContents, PDPage pdPage) {
public List<MarkedContentPosition> getMarkedContentPositions(List<PDMarkedContent> markedContents) {
if (markedContents == null) {
return Collections.emptyList();
@ -62,7 +61,7 @@ public class MarkedContentUtils {
return markedContents.stream()
.filter(m -> !m.getContents().isEmpty())
.map(markedContent -> MarkedContentPosition.fromPDMarkedContent(markedContent, pdPage))
.map(MarkedContentPosition::fromPDMarkedContent)
.toList();
}
@ -77,20 +76,20 @@ public class MarkedContentUtils {
public record MarkedContentPosition(String type, String subType, List<Rectangle2D> textPositions) {
public static MarkedContentPosition fromPDMarkedContent(PDMarkedContent markedContent, PDPage pdPage) {
public static MarkedContentPosition fromPDMarkedContent(PDMarkedContent markedContent) {
return new MarkedContentPosition(markedContent.getTag(), parseSubType(markedContent), parseTextPositions(markedContent.getContents(), pdPage));
return new MarkedContentPosition(markedContent.getTag(), parseSubType(markedContent), parseTextPositions(markedContent.getContents()));
}
private static List<Rectangle2D> parseTextPositions(List<Object> contents, PDPage pdPage) {
private static List<Rectangle2D> parseTextPositions(List<Object> contents) {
return contents.stream()
.filter(content -> content instanceof TextPosition)
.map(content -> (TextPosition) content)
.filter(content -> !content.getUnicode().equals(" "))
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true).getRectangle())
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true))
.map(TextPositionSequence::getBoundingBox)
.collect(Collectors.toList());
}

View File

@ -234,7 +234,7 @@ public class LayoutparsingVisualizations {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, markedContent);
List<MarkedContentUtils.MarkedContentPosition> markedContentBBoxMapBySubType = MarkedContentUtils.getMarkedContentPositions(markedContents, pdPage);
List<MarkedContentUtils.MarkedContentPosition> markedContentBBoxMapBySubType = MarkedContentUtils.getMarkedContentPositions(markedContents);
markedContentBBoxMapBySubType.forEach(markedContentPosition -> {

View File

@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Test
public void testLayoutParserEndToEnd() {
String filePath = "/home/kschuettler/Dokumente/TestFiles/tables with striketrough text.pdf";
String filePath = "/home/kschuettler/Dokumente/TestFiles/large number of prod files/101 S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
runForFile(filePath);
}

View File

@ -52,7 +52,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
RulingCleaningService rulingCleaningService = new RulingCleaningService();
List<List<Rectangle2D>> rectanglesPerPage = new LinkedList<>();
for (PageContents pageContent : pageContents) {
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings());
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
List<Rectangle2D> rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
rectanglesPerPage.add(rects);
}
@ -72,7 +72,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
RulingCleaningService rulingCleaningService = new RulingCleaningService();
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
for (PageContents pageContent : pageContents) {
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()));
cleanRulingsPerPage.add(rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings()));
}
var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVerticals).collect(Collectors.toList());
PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName);