RED-8825: general improvements
* some more refactoring
This commit is contained in:
parent
64209255cb
commit
08be18db2d
@ -261,7 +261,7 @@ public class LayoutParsingPipeline {
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
|
||||
@ -293,10 +293,10 @@ public class LayoutParsingPipeline {
|
||||
|
||||
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber, pdPage);
|
||||
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
||||
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents(), pdPage));
|
||||
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
|
||||
|
||||
// Whether an image is OCR must be determined before textBlocks are moved into tables; otherwise the findOcr algorithm needs to be adapted.
|
||||
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
|
||||
if (pdfImages.containsKey(pageNumber)) {
|
||||
classificationPage.setImages(pdfImages.get(pageNumber));
|
||||
imageServiceResponseAdapter.findOcr(classificationPage);
|
||||
}
|
||||
@ -370,11 +370,11 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
|
||||
|
||||
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents, PDPage pdPage) {
|
||||
private Map<String, List<Rectangle2D>> convertMarkedContents(List<PDMarkedContent> pdMarkedContents) {
|
||||
|
||||
Map<String, List<Rectangle2D>> markedContentBboxes = new HashMap<>();
|
||||
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER, pdPage));
|
||||
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER, pdPage));
|
||||
markedContentBboxes.put(MarkedContentUtils.HEADER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.HEADER));
|
||||
markedContentBboxes.put(MarkedContentUtils.FOOTER, MarkedContentUtils.getMarkedContentBboxPerLine(pdMarkedContents, MarkedContentUtils.FOOTER));
|
||||
return markedContentBboxes;
|
||||
}
|
||||
|
||||
|
||||
@ -9,6 +9,7 @@ import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
@ -54,11 +55,12 @@ public class ImageServiceResponseAdapter {
|
||||
|
||||
classificationPage.getImages().forEach(image -> {
|
||||
if (image.getImageType().equals(ImageType.OTHER)) {
|
||||
classificationPage.getTextBlocks().forEach(textblock -> {
|
||||
if (image.getPosition().contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
|
||||
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
|
||||
if (image.getPosition().contains(textblock.getBBox())) {
|
||||
image.setImageType(ImageType.OCR);
|
||||
return;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@ -31,7 +31,7 @@ public class RulingCleaningService {
|
||||
private static final float THRESHOLD_Y_HORIZONTAL = 3;
|
||||
|
||||
|
||||
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings) {
|
||||
public CleanRulings deduplicateAndStraightenRulings(List<TableCells> tableCells, List<Ruling> rulings) {
|
||||
|
||||
Rulings verticalAndHorizontalRulingLines;
|
||||
|
||||
|
||||
@ -12,9 +12,9 @@ import lombok.experimental.UtilityClass;
|
||||
@UtilityClass
|
||||
public class TextRulingsClassifier {
|
||||
|
||||
private final static double STRIKETHROUGH_ZONE = 0.5; // multiplied with text height, determines height of intersection interval for strikethrough lines
|
||||
private final static double UNDERLINE_ZONE = 0.2; // multiplied with text height, determines height of intersection interval of underline lines
|
||||
private final static double TEXT_BBOX_THRESHOLD_FACTOR = 0.15; // multiplied with text width, subtracted from word width
|
||||
private final static double STRIKETHROUGH_ZONE = 0.5; // multiplied with text height, determines height of intersection interval for strikethrough lines.
|
||||
private final static double UNDERLINE_ZONE = 0.2; // multiplied with text height, determines height of intersection interval of underline lines.
|
||||
private final static double TEXT_BBOX_THRESHOLD_FACTOR = 0.15; // multiplied with text width then subtracted from word width. If ruling covers this width, it is considered as strikethrough/underline.
|
||||
|
||||
|
||||
public static void classifyUnderlinedAndStrikethroughText(List<TextPositionSequence> words, CleanRulings cleanRulings) {
|
||||
|
||||
@ -48,7 +48,7 @@ public class DocstrumBlockificationService {
|
||||
visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage());
|
||||
}
|
||||
|
||||
var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontals(), usedRulings.getVerticals(), xyOrder, usedRulings);
|
||||
var pageBlocks = toAbstractPageBlocks(zones, xyOrder, usedRulings);
|
||||
|
||||
if (xyOrder) {
|
||||
sortPageBlocksXThenY(pageBlocks);
|
||||
@ -77,10 +77,7 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones,
|
||||
List<Ruling> horizontalRulings,
|
||||
List<Ruling> verticalRulings,
|
||||
boolean xyOrder,
|
||||
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, boolean xyOrder,
|
||||
CleanRulings usedRulings) {
|
||||
|
||||
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
|
||||
|
||||
@ -5,7 +5,6 @@ import java.util.Locale;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
@ -13,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -49,7 +49,6 @@ public class DocuMineClassificationService {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
|
||||
log.debug("headlineFontSizes: {}", headlineFontSizes);
|
||||
|
||||
@ -8,7 +8,6 @@ import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
@ -24,7 +23,7 @@ public class MarkedContentUtils {
|
||||
public static final String FOOTER = "Footer";
|
||||
|
||||
|
||||
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype, PDPage pdPage) {
|
||||
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
|
||||
|
||||
if (markedContents == null) {
|
||||
return Collections.emptyList();
|
||||
@ -54,7 +53,7 @@ public class MarkedContentUtils {
|
||||
}
|
||||
|
||||
|
||||
public List<MarkedContentPosition> getMarkedContentPositions(List<PDMarkedContent> markedContents, PDPage pdPage) {
|
||||
public List<MarkedContentPosition> getMarkedContentPositions(List<PDMarkedContent> markedContents) {
|
||||
|
||||
if (markedContents == null) {
|
||||
return Collections.emptyList();
|
||||
@ -62,7 +61,7 @@ public class MarkedContentUtils {
|
||||
|
||||
return markedContents.stream()
|
||||
.filter(m -> !m.getContents().isEmpty())
|
||||
.map(markedContent -> MarkedContentPosition.fromPDMarkedContent(markedContent, pdPage))
|
||||
.map(MarkedContentPosition::fromPDMarkedContent)
|
||||
.toList();
|
||||
}
|
||||
|
||||
@ -77,20 +76,20 @@ public class MarkedContentUtils {
|
||||
|
||||
public record MarkedContentPosition(String type, String subType, List<Rectangle2D> textPositions) {
|
||||
|
||||
public static MarkedContentPosition fromPDMarkedContent(PDMarkedContent markedContent, PDPage pdPage) {
|
||||
public static MarkedContentPosition fromPDMarkedContent(PDMarkedContent markedContent) {
|
||||
|
||||
return new MarkedContentPosition(markedContent.getTag(), parseSubType(markedContent), parseTextPositions(markedContent.getContents(), pdPage));
|
||||
return new MarkedContentPosition(markedContent.getTag(), parseSubType(markedContent), parseTextPositions(markedContent.getContents()));
|
||||
}
|
||||
|
||||
|
||||
private static List<Rectangle2D> parseTextPositions(List<Object> contents, PDPage pdPage) {
|
||||
private static List<Rectangle2D> parseTextPositions(List<Object> contents) {
|
||||
|
||||
return contents.stream()
|
||||
.filter(content -> content instanceof TextPosition)
|
||||
.map(content -> (TextPosition) content)
|
||||
.filter(content -> !content.getUnicode().equals(" "))
|
||||
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true).getRectangle())
|
||||
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
|
||||
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true))
|
||||
.map(TextPositionSequence::getBoundingBox)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
@ -234,7 +234,7 @@ public class LayoutparsingVisualizations {
|
||||
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, markedContent);
|
||||
|
||||
List<MarkedContentUtils.MarkedContentPosition> markedContentBBoxMapBySubType = MarkedContentUtils.getMarkedContentPositions(markedContents, pdPage);
|
||||
List<MarkedContentUtils.MarkedContentPosition> markedContentBBoxMapBySubType = MarkedContentUtils.getMarkedContentPositions(markedContents);
|
||||
|
||||
markedContentBBoxMapBySubType.forEach(markedContentPosition -> {
|
||||
|
||||
|
||||
@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@Test
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
String filePath = "/home/kschuettler/Dokumente/TestFiles/tables with striketrough text.pdf";
|
||||
String filePath = "/home/kschuettler/Dokumente/TestFiles/large number of prod files/101 S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
|
||||
runForFile(filePath);
|
||||
}
|
||||
|
||||
@ -52,7 +52,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
List<List<Rectangle2D>> rectanglesPerPage = new LinkedList<>();
|
||||
for (PageContents pageContent : pageContents) {
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings());
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
|
||||
List<Rectangle2D> rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
rectanglesPerPage.add(rects);
|
||||
}
|
||||
@ -72,7 +72,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
|
||||
for (PageContents pageContent : pageContents) {
|
||||
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()));
|
||||
cleanRulingsPerPage.add(rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings()));
|
||||
}
|
||||
var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVerticals).collect(Collectors.toList());
|
||||
PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user