RED-8825: general improvements

* some more refactoring
 * fixed text ruling classification for vertical text
 * shrunk min graphics size
This commit is contained in:
Kilian Schuettler 2024-04-30 10:44:32 +02:00
parent 08be18db2d
commit 15ea385f4d
9 changed files with 73 additions and 19 deletions

View File

@ -280,8 +280,9 @@ public class LayoutParsingPipeline {
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations());
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations());
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
};
classificationPage.setCleanRulings(cleanRulings);
@ -311,12 +312,6 @@ public class LayoutParsingPipeline {
tableExtractionService.extractTables(emptyTableCells, classificationPage);
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
docstrumBlockificationService.combineBlocks(classificationPage);
} else if (layoutParsingType == LayoutParsingType.CLARIFYND) {
docstrumBlockificationService.mergeIntersectingBlocks(classificationPage, 0, 6.5f);
}
buildPageStatistics(classificationPage);
increaseDocumentStatistics(classificationPage, classificationDocument);

View File

@ -37,7 +37,7 @@ public class TextRulingsClassifier {
float strikethroughCenterX = (float) word.getBoundingBox().getCenterX();
float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2);
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBoundingBox().getMinX() : word.getBoundingBox().getMaxX());
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBoundingBox().getMaxX() : word.getBoundingBox().getMinX());
float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2);
float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight);

View File

@ -9,6 +9,7 @@ import java.util.ListIterator;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
@ -36,7 +37,11 @@ public class DocstrumBlockificationService {
static final float THRESHOLD = 1f;
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings rulings, boolean xyOrder, LayoutparsingVisualizations visualizations) {
public ClassificationPage blockify(List<TextPositionSequence> textPositions,
CleanRulings rulings,
boolean xyOrder,
LayoutparsingVisualizations visualizations,
LayoutParsingType layoutParsingType) {
CleanRulings usedRulings = rulings.withoutTextRulings();
@ -59,6 +64,12 @@ public class DocstrumBlockificationService {
mergeIntersectingBlocks(classificationPage, 0, 0);
combineBlocks(classificationPage);
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
mergeIntersectingBlocks(classificationPage, 0, 6.5f);
}
return classificationPage;
}
@ -77,8 +88,7 @@ public class DocstrumBlockificationService {
}
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, boolean xyOrder,
CleanRulings usedRulings) {
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, boolean xyOrder, CleanRulings usedRulings) {
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
zones.forEach(zone -> {
@ -103,6 +113,7 @@ public class DocstrumBlockificationService {
TextPageBlock previous = new TextPageBlock();
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
CleanRulings usedRulings = page.getCleanRulings().withoutTextRulings();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
@ -114,7 +125,7 @@ public class DocstrumBlockificationService {
if (previous != null && !previous.getSequences().isEmpty()) {
if (current.getDir() != previous.getDir()) {
if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current.getBBox(), previous.getBBox())) {
previous = current;
continue;
}

View File

@ -21,6 +21,9 @@ import lombok.SneakyThrows;
@RequiredArgsConstructor
public class GraphicExtractorService {
private static final int MIN_GRAPHICS_SIDE_LENGTH = 30;
private static final int MIN_GRAPHICS_AREA = 500;
private final GraphicsClusteringService graphicsClusteringService;
private final FindGraphicsRaster findGraphicsRaster;
@ -55,7 +58,7 @@ public class GraphicExtractorService {
List<Box> clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14);
return clusters.stream()
.filter(box -> box.area() > 500 && box.height() > 50 && box.width() > 50)
.filter(box -> box.area() > MIN_GRAPHICS_AREA && box.height() > MIN_GRAPHICS_SIDE_LENGTH && box.width() > MIN_GRAPHICS_SIDE_LENGTH)
.toList();
}

View File

@ -127,8 +127,8 @@ public class LayoutparsingVisualizations {
}
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
visualizationsOnPage.getColoredLines()
.addAll(Stream.of(cleanRulings.getHorizontals(), cleanRulings.getVerticals())
.flatMap(Collection::stream)
.addAll(cleanRulings.buildAll()
.stream()
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 1))
.toList());
}

View File

@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Test
public void testLayoutParserEndToEnd() {
String filePath = "/home/kschuettler/Dokumente/TestFiles/large number of prod files/101 S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String filePath = "/home/kschuettler/Dokumente/TestFiles/RotateTextWithRulingsTestFile.pdf";
runForFile(filePath);
}

View File

@ -0,0 +1,46 @@
package com.knecon.fforesight.service.layoutparser.server.services;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Collections;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
import lombok.SneakyThrows;
/**
 * Checks that {@link TextRulingsClassifier} marks words and rulings as underline / strikethrough
 * for every page of the rotated-text sample PDF.
 */
public class TextRulingsClassifierTest {

    /**
     * Runs ruling cleaning and text-ruling classification page by page and asserts, per page, that
     * no word reading "Underlined" lacks the underline flag, no word reading "Striketrough" lacks the
     * strikethrough flag, and exactly four rulings carry each of the two classifications.
     */
    @Test
    @SneakyThrows
    public void textRulingExtractionTest() {

        List<PageContents> pages = PageContentExtractor.getSortedPageContents("files/Minimal Examples/RotateTextWithRulingsTestFile.pdf");
        RulingCleaningService cleaner = new RulingCleaningService();

        for (PageContents page : pages) {
            // No table cells are supplied here, only the rulings extracted from the page.
            CleanRulings rulings = cleaner.deduplicateAndStraightenRulings(Collections.emptyList(), page.getRulings());

            // NOTE(review): the result of find(..) is discarded - presumably only exercised here to make sure it runs on these rulings; confirm.
            RectangularIntersectionFinder.find(rulings.getHorizontals(), rulings.getVerticals());

            TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(page.getSortedTextPositionSequences(), rulings);

            // "No matching word without the flag" is the same condition as "all matching words carry the flag".
            boolean everyUnderlinedWordFlagged = page.getSortedTextPositionSequences()
                    .stream()
                    .noneMatch(word -> word.toString().equals("Underlined") && !word.isUnderline());
            assertTrue(everyUnderlinedWordFlagged);

            boolean everyStrikethroughWordFlagged = page.getSortedTextPositionSequences()
                    .stream()
                    .noneMatch(word -> word.toString().equals("Striketrough") && !word.isStrikethrough());
            assertTrue(everyStrikethroughWordFlagged);

            assertEquals(4, countRulingsClassifiedAs(rulings, Ruling.Classification.STRIKETROUGH));
            assertEquals(4, countRulingsClassifiedAs(rulings, Ruling.Classification.UNDERLINE));
        }
    }


    /**
     * Counts the rulings returned by {@code buildAll()} whose classification equals the given one.
     *
     * @param rulings        the cleaned rulings of a single page
     * @param classification the classification to look for
     * @return number of rulings with that classification
     */
    private static long countRulingsClassifiedAs(CleanRulings rulings, Ruling.Classification classification) {

        return rulings.buildAll()
                .stream()
                .filter(ruling -> ruling.getClassification().equals(classification))
                .count();
    }

}

View File

@ -185,8 +185,7 @@ public class ViewerDocumentService {
contentStream.setFont(font, placedText.fontSize());
contentStream.beginText();
contentStream.setNonStrokingColor(placedText.color());
if (placedText.renderingMode()
.isPresent()) {
if (placedText.renderingMode().isPresent()) {
contentStream.setRenderingMode(placedText.renderingMode().get());
} else {
contentStream.setRenderingMode(RenderingMode.FILL);