RED-8825: general improvements
* some more refactoring
* fixed text ruling classification for vertical text
* shrunk min graphics size
This commit is contained in:
parent
08be18db2d
commit
15ea385f4d
@ -280,8 +280,9 @@ public class LayoutParsingPipeline {
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
|
||||
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
||||
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations());
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations());
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
|
||||
};
|
||||
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
@ -311,12 +312,6 @@ public class LayoutParsingPipeline {
|
||||
|
||||
tableExtractionService.extractTables(emptyTableCells, classificationPage);
|
||||
|
||||
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
||||
docstrumBlockificationService.combineBlocks(classificationPage);
|
||||
} else if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
||||
docstrumBlockificationService.mergeIntersectingBlocks(classificationPage, 0, 6.5f);
|
||||
}
|
||||
|
||||
buildPageStatistics(classificationPage);
|
||||
increaseDocumentStatistics(classificationPage, classificationDocument);
|
||||
|
||||
|
||||
@ -37,7 +37,7 @@ public class TextRulingsClassifier {
|
||||
float strikethroughCenterX = (float) word.getBoundingBox().getCenterX();
|
||||
float strikethroughBoxHeight = (float) ((word.getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||
|
||||
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBoundingBox().getMinX() : word.getBoundingBox().getMaxX());
|
||||
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBoundingBox().getMaxX() : word.getBoundingBox().getMinX());
|
||||
float underlineBoxHeight = (float) ((word.getHeight() * UNDERLINE_ZONE) / 2);
|
||||
|
||||
float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight);
|
||||
|
||||
@ -9,6 +9,7 @@ import java.util.ListIterator;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||
@ -36,7 +37,11 @@ public class DocstrumBlockificationService {
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings rulings, boolean xyOrder, LayoutparsingVisualizations visualizations) {
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions,
|
||||
CleanRulings rulings,
|
||||
boolean xyOrder,
|
||||
LayoutparsingVisualizations visualizations,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
|
||||
CleanRulings usedRulings = rulings.withoutTextRulings();
|
||||
|
||||
@ -59,6 +64,12 @@ public class DocstrumBlockificationService {
|
||||
|
||||
mergeIntersectingBlocks(classificationPage, 0, 0);
|
||||
|
||||
combineBlocks(classificationPage);
|
||||
|
||||
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
||||
mergeIntersectingBlocks(classificationPage, 0, 6.5f);
|
||||
}
|
||||
|
||||
return classificationPage;
|
||||
}
|
||||
|
||||
@ -77,8 +88,7 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, boolean xyOrder,
|
||||
CleanRulings usedRulings) {
|
||||
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, boolean xyOrder, CleanRulings usedRulings) {
|
||||
|
||||
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
|
||||
zones.forEach(zone -> {
|
||||
@ -103,6 +113,7 @@ public class DocstrumBlockificationService {
|
||||
|
||||
TextPageBlock previous = new TextPageBlock();
|
||||
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
||||
CleanRulings usedRulings = page.getCleanRulings().withoutTextRulings();
|
||||
while (itty.hasNext()) {
|
||||
|
||||
AbstractPageBlock block = itty.next();
|
||||
@ -114,7 +125,7 @@ public class DocstrumBlockificationService {
|
||||
|
||||
if (previous != null && !previous.getSequences().isEmpty()) {
|
||||
|
||||
if (current.getDir() != previous.getDir()) {
|
||||
if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current.getBBox(), previous.getBBox())) {
|
||||
previous = current;
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -21,6 +21,9 @@ import lombok.SneakyThrows;
|
||||
@RequiredArgsConstructor
|
||||
public class GraphicExtractorService {
|
||||
|
||||
private static final int MIN_GRAPHICS_SIDE_LENGTH = 30;
|
||||
private static final int MIN_GRAPHICS_AREA = 500;
|
||||
|
||||
private final GraphicsClusteringService graphicsClusteringService;
|
||||
private final FindGraphicsRaster findGraphicsRaster;
|
||||
|
||||
@ -55,7 +58,7 @@ public class GraphicExtractorService {
|
||||
List<Box> clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14);
|
||||
|
||||
return clusters.stream()
|
||||
.filter(box -> box.area() > 500 && box.height() > 50 && box.width() > 50)
|
||||
.filter(box -> box.area() > MIN_GRAPHICS_AREA && box.height() > MIN_GRAPHICS_SIDE_LENGTH && box.width() > MIN_GRAPHICS_SIDE_LENGTH)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -127,8 +127,8 @@ public class LayoutparsingVisualizations {
|
||||
}
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
|
||||
visualizationsOnPage.getColoredLines()
|
||||
.addAll(Stream.of(cleanRulings.getHorizontals(), cleanRulings.getVerticals())
|
||||
.flatMap(Collection::stream)
|
||||
.addAll(cleanRulings.buildAll()
|
||||
.stream()
|
||||
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 1))
|
||||
.toList());
|
||||
}
|
||||
|
||||
@ -34,7 +34,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
@Test
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
String filePath = "/home/kschuettler/Dokumente/TestFiles/large number of prod files/101 S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String filePath = "/home/kschuettler/Dokumente/TestFiles/RotateTextWithRulingsTestFile.pdf";
|
||||
|
||||
runForFile(filePath);
|
||||
}
|
||||
|
||||
@ -0,0 +1,46 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
/**
 * Checks that {@code TextRulingsClassifier} marks words and rulings as underline / strikethrough
 * for the pages of the minimal example file {@code RotateTextWithRulingsTestFile.pdf}.
 */
public class TextRulingsClassifierTest {
|
||||
|
||||
/**
 * For every page of the test file: cleans the page's rulings, runs the text-ruling
 * classification, then asserts on the flags of the matching words and on the number of
 * rulings per classification.
 */
@Test
|
||||
@SneakyThrows
|
||||
public void textRulingExtractionTest() {
|
||||
|
||||
String fileName = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
|
||||
for (PageContents pageContent : pageContents) {
|
||||
// An empty list is passed as the first argument, so only the page's own rulings are supplied here.
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
|
||||
// NOTE(review): the return value is discarded; presumably this only verifies that the call does not throw - confirm.
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings);
|
||||
|
||||
// NOTE(review): allMatch on an empty stream returns true, so both assertions below pass
// vacuously when no word equals the filter string - consider also asserting that matches exist.
assertTrue(pageContent.getSortedTextPositionSequences().stream().filter(word -> word.toString().equals("Underlined")).allMatch(TextPositionSequence::isUnderline));
|
||||
// NOTE(review): "Striketrough" (sic) presumably mirrors the exact spelling used inside the test PDF - confirm before changing.
assertTrue(pageContent.getSortedTextPositionSequences().stream().filter(word -> word.toString().equals("Striketrough")).allMatch(TextPositionSequence::isStrikethrough));
|
||||
|
||||
// Exactly four rulings of each classification are expected on every page iterated by this loop.
assertEquals(4, cleanRulings.buildAll().stream().filter(ruling -> ruling.getClassification().equals(Ruling.Classification.STRIKETROUGH)).count());
|
||||
assertEquals(4, cleanRulings.buildAll().stream().filter(ruling -> ruling.getClassification().equals(Ruling.Classification.UNDERLINE)).count());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
Binary file not shown.
@ -185,8 +185,7 @@ public class ViewerDocumentService {
|
||||
contentStream.setFont(font, placedText.fontSize());
|
||||
contentStream.beginText();
|
||||
contentStream.setNonStrokingColor(placedText.color());
|
||||
if (placedText.renderingMode()
|
||||
.isPresent()) {
|
||||
if (placedText.renderingMode().isPresent()) {
|
||||
contentStream.setRenderingMode(placedText.renderingMode().get());
|
||||
} else {
|
||||
contentStream.setRenderingMode(RenderingMode.FILL);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user