RED-7141: Fixed problem with different text directions

This commit is contained in:
Dominique Eifländer 2024-02-26 13:08:52 +01:00
parent 385d4b399e
commit 32c877e8f7
14 changed files with 172 additions and 17 deletions

View File

@ -316,7 +316,7 @@ public class LayoutParsingPipeline {
log.info("Building Sections for {}", identifier);
if (layoutParsingType == DOCSTRUM || layoutParsingType == DOCSTRUM_XY) {
// Currently for debugging return paragraphs as sections, becaus there is a merging logic in sectionBuilder
// Currently for debugging return paragraphs as sections, because there is a merging logic in sectionBuilder
List<ClassificationSection> sections = new ArrayList<>();
for (var page : classificationPages) {
page.getTextBlocks().forEach(block -> {

View File

@ -102,6 +102,12 @@ public abstract class AbstractPageBlock {
}
/**
 * Tests whether the bounds of this block overlap the bounds of another block.
 *
 * <p>The overlap is checked on the Y axis first and on the X axis second. All
 * comparisons are strict except {@code this.maxY >= apb.getMinY()}, which also
 * accepts blocks that merely touch on that edge.
 * NOTE(review): the single inclusive comparison looks asymmetric — confirm it is intended.
 *
 * @param apb the block to test against
 * @return {@code true} if both the Y ranges and the X ranges overlap
 */
public boolean intersects(AbstractPageBlock apb) {

    boolean overlapsOnY = this.minY < apb.getMaxY() && this.maxY >= apb.getMinY();
    if (!overlapsOnY) {
        return false;
    }
    return this.minX < apb.getMaxX() && this.maxX > apb.getMinX();
}
public abstract boolean isEmpty();
}

View File

@ -45,6 +45,9 @@ public class RedTextPosition {
@JsonIgnore
private String fontName;
@JsonIgnore
TextPosition textPosition;
@SneakyThrows
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
@ -63,6 +66,7 @@ public class RedTextPosition {
position[3] = textPosition.getHeightDir();
pos.setPosition(position);
pos.textPosition = textPosition;
return pos;
}

View File

@ -19,6 +19,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFre
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService;
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
@ -48,6 +49,7 @@ public class DocstrumBlockificationService {
});
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulingLines, verticalRulingLines));
// abstractPageBlocks.add(buildTextBlock(textPositionSequences, 0));
});
return new ClassificationPage(abstractPageBlocks);
@ -56,6 +58,8 @@ public class DocstrumBlockificationService {
public void combineBlocks(ClassificationPage page) {
mergeZones(page.getTextBlocks());
TextPageBlock previous = new TextPageBlock();
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
while (itty.hasNext()) {
@ -65,9 +69,9 @@ public class DocstrumBlockificationService {
}
TextPageBlock current = (TextPageBlock) block;
if (previous != null) {
if (previous != null && !previous.getSequences().isEmpty()) {
if ((Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 /* && current.getNumberOfLines() <= 10 */ && previous.getNumberOfLines() <= current.getNumberOfLines())) {
if (current.getDir() == previous.getDir() && (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 /* && current.getNumberOfLines() <= 10 */ && previous.getNumberOfLines() <= current.getNumberOfLines())) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
itty.remove();
@ -78,16 +82,17 @@ public class DocstrumBlockificationService {
continue;
}
if ((Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 /* && current.getNumberOfLines() <= 10 */ || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1)) {
if (current.getDir() == previous.getDir() && (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 /* && current.getNumberOfLines() <= 10 */ || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1)) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences(), 0);
itty.remove();
continue;
}
if (previous.containsBlock(current, THRESHOLD)) {
if (current.getDir() == previous.getDir() && previous.containsBlock(current, THRESHOLD)) {
previous.getSequences().addAll(current.getSequences());
previous = buildTextBlock(previous.getSequences().stream().sorted(new TextPositionSequenceComparator()).collect(Collectors.toList()), 0);
QuickSort.sort(previous.getSequences(), new TextPositionSequenceComparator());
previous = buildTextBlock(previous.getSequences(), 0);
itty.remove();
continue;
}
@ -98,6 +103,40 @@ public class DocstrumBlockificationService {
}
/**
 * Merges text blocks that share a text direction and whose bounds intersect.
 *
 * <p>For every text block, each other text block with the same direction that
 * {@code almostIntersects} it (called with both thresholds set to 0) is absorbed:
 * its sequences are appended, the combined sequences are sorted, and the block is
 * rebuilt from them. The rebuilt block replaces the original entry in {@code zones}
 * and every absorbed block is removed from {@code zones} once iteration is finished.
 * Table blocks are never merged.
 *
 * <p>This is a single pass: a block is only compared against the blocks the inner
 * loop still has ahead of it after the current block has grown.
 *
 * @param zones the page blocks to merge; modified in place
 */
private void mergeZones(List<AbstractPageBlock> zones) {

    // Identity-based on purpose: two distinct blocks must never be treated as the same
    // entry, whatever equals() does. Fully qualified so no import has to change.
    java.util.Set<AbstractPageBlock> absorbed = java.util.Collections.newSetFromMap(new java.util.IdentityHashMap<>());

    ListIterator<AbstractPageBlock> itty = zones.listIterator();
    while (itty.hasNext()) {
        AbstractPageBlock block = itty.next();
        if (block instanceof TablePageBlock || absorbed.contains(block)) {
            continue;
        }

        TextPageBlock current = (TextPageBlock) block;

        for (AbstractPageBlock innerZone : zones) {
            if (innerZone == current || innerZone instanceof TablePageBlock || absorbed.contains(innerZone)) {
                continue;
            }

            TextPageBlock inner = (TextPageBlock) innerZone;

            if (current.getDir() == inner.getDir() && current.almostIntersects(inner, 0, 0)) {
                current.getSequences().addAll(inner.getSequences());
                // Same sort-then-rebuild approach as combineBlocks.
                QuickSort.sort(current.getSequences(), new TextPositionSequenceComparator());
                current = buildTextBlock(current.getSequences(), 0);

                // set() is not a structural modification, so neither iterator is invalidated.
                itty.set(current);
                absorbed.add(inner);
            }
        }
    }

    // Structural removal is deferred until no iterator over zones is active.
    zones.removeIf(absorbed::contains);
}
public List<AbstractPageBlock> splitZonesAtRulings(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
int indexOnPage = 0;

View File

@ -1,10 +1,12 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
@ -29,20 +31,29 @@ public class DocstrumSegmentationService {
/**
 * Segments the text of a page into zones and puts them into reading order.
 *
 * <p>Zones are computed separately for each of the four supported text directions,
 * so text with different directions never ends up in the same zone. The zones of all
 * directions are then handed to the reading order service together.
 *
 * @param textPositions the text position sequences of the page
 * @param xyOder        passed through to {@code readingOrderService.resolve}
 * @return the zones as returned by {@code readingOrderService.resolve}
 */
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOder) {

    List<Zone> zones = new ArrayList<>();
    for (TextDirection direction : List.of(TextDirection.ZERO, TextDirection.QUARTER_CIRCLE, TextDirection.HALF_CIRCLE, TextDirection.THREE_QUARTER_CIRCLE)) {
        zones.addAll(computeZones(textPositions, direction));
    }

    return readingOrderService.resolve(zones, xyOder);
}
/**
 * Computes the zones for the text of a single text direction.
 *
 * <p>Only sequences whose direction equals {@code direction} are considered. Their
 * text positions are wrapped into characters, nearest neighbours are determined, and
 * character and line spacing are derived from them. Lines are then built from the
 * characters and zones from the lines. Reading order is not resolved here.
 *
 * @param textPositions the text position sequences of the page
 * @param direction     the text direction to build zones for
 * @return the zones produced by {@code zoneBuilderService.buildZones}
 */
private List<Zone> computeZones(List<TextPositionSequence> textPositions, TextDirection direction) {

    var positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();

    var characters = positions.stream().map(Character::new).collect(Collectors.toList());

    nearestNeighbourService.findNearestNeighbors(characters);

    var characterSpacing = spacingService.computeCharacterSpacing(characters);
    // The computed line spacing is capped at 20.
    // NOTE(review): presumably guards against oversized estimates on sparse pages — confirm.
    var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);

    var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
    return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
}
}

View File

@ -31,7 +31,8 @@ public class LineBuilderService {
character.getNeighbors().forEach(neighbor -> {
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
double y = neighbor.getVerticalDistance() / maxVerticalDistance;
if (filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, 2) <= 1) {
if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() && filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y,
2) <= 1) {
sets.union(character, neighbor.getCharacter());
}
});
@ -40,6 +41,7 @@ public class LineBuilderService {
List<Line> lines = new ArrayList<>();
sets.forEach(group -> {
List<Character> lineCharacters = new ArrayList<>(group);
// QuickSort.sort(lineCharacters, new CharacterComparator());
lineCharacters.sort(Comparator.comparingDouble(Character::getX));
lines.add(new Line(lineCharacters, characterSpacing));
});

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.s
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.ListIterator;
import java.util.Set;
import org.springframework.stereotype.Service;
@ -64,17 +65,46 @@ public class ZoneBuilderService {
List<Zone> zones = new ArrayList<>();
sets.forEach(group -> {
zones.add(mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing));
zones.add(new Zone(new ArrayList<>(group)));
});
if (zones.size() > MAX_ZONES) {
// List<Zone> mergedZones = mergeZones(zones);
List<Zone> finalZones = zones;
if (finalZones.size() > MAX_ZONES) {
List<Line> oneZoneLines = new ArrayList<>();
for (Zone zone : zones) {
for (Zone zone : finalZones) {
oneZoneLines.addAll(zone.getLines());
}
return List.of(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing));
}
return finalZones;
}
/**
 * Folds every zone whose bounding box intersects another zone into that other zone.
 *
 * <p>For each zone the list is scanned from the start for the first different zone
 * with an intersecting bounding box. If one is found, the lines of the visited zone
 * are appended to it, its bounding box is rebuilt, and the visited zone is removed
 * from the list.
 *
 * @param zones the zones to merge; modified in place
 * @return the same list instance that was passed in
 */
private List<Zone> mergeZones(List<Zone> zones) {

    ListIterator<Zone> cursor = zones.listIterator();
    while (cursor.hasNext()) {
        Zone candidate = cursor.next();

        // Look for the first other zone that overlaps the candidate.
        Zone target = null;
        for (Zone other : zones) {
            if (other != candidate && candidate.getBBox().intersects(other.getBBox())) {
                target = other;
                break;
            }
        }
        if (target == null) {
            continue;
        }

        target.getLines().addAll(candidate.getLines());
        target.buildBBox();
        // The scan above has ended, so removing through the list iterator is safe.
        cursor.remove();
    }
    return zones;
}
@ -93,6 +123,16 @@ public class ZoneBuilderService {
}
/**
 * Applies {@code mergeLinesInZone} to the lines of every given zone.
 *
 * @param zones            the zones whose lines should be merged
 * @param characterSpacing passed through to {@code mergeLinesInZone}
 * @param lineSpacing      passed through to {@code mergeLinesInZone}
 * @return a new list with one merged zone per input zone, in input order
 */
private List<Zone> mergeLinesInZones(List<Zone> zones, double characterSpacing, double lineSpacing) {

    List<Zone> result = new ArrayList<>();
    zones.forEach(zone -> result.add(mergeLinesInZone(zone.getLines(), characterSpacing, lineSpacing)));
    return result;
}
private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {
double maxHorizontalDistance = 0;

View File

@ -0,0 +1,40 @@
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
import java.util.Comparator;
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
/**
 * Orders {@link Character}s by text direction, then top to bottom, then left to right.
 *
 * <p>Characters with different text directions are ordered by direction only. Within
 * one direction, two characters count as being on the same line when their bottom
 * edges are less than 0.1 apart or when the bottom edge of one lies within the
 * vertical extent of the other; such characters are ordered by their X coordinate.
 * All other pairs are ordered by their bottom edge. The coordinates used are the
 * direction-adjusted ones, where 0,0 is the upper left corner.
 */
public class CharacterComparator implements Comparator<Character> {

    @Override
    public int compare(Character pos1, Character pos2) {

        var first = pos1.getTextPosition();
        var second = pos2.getTextPosition();

        // Text in different directions is never compared by position.
        int directionOrder = Float.compare(first.getDir(), second.getDir());
        if (directionOrder != 0) {
            return directionOrder;
        }

        float firstBottom = first.getYDirAdj();
        float secondBottom = second.getYDirAdj();

        // Y grows downwards, so the top edge is the bottom edge minus the height.
        float firstTop = firstBottom - first.getHeightDir();
        float secondTop = secondBottom - second.getHeightDir();

        // Simple tolerance check for "on the same line".
        boolean sameBaseline = Math.abs(firstBottom - secondBottom) < .1;
        boolean secondBottomWithinFirst = secondBottom >= firstTop && secondBottom <= firstBottom;
        boolean firstBottomWithinSecond = firstBottom >= secondTop && firstBottom <= secondBottom;

        if (sameBaseline || secondBottomWithinFirst || firstBottomWithinSecond) {
            return Float.compare(first.getXDirAdj(), second.getXDirAdj());
        }
        return firstBottom < secondBottom ? -1 : 1;
    }
}

View File

@ -237,8 +237,13 @@ public class PDFLinesTextStripper extends PDFTextStripper {
int startIndex = 0;
RedTextPosition previous = null;
float direction = -1;
for (int i = 0; i <= textPositions.size() - 1; i++) {
if (direction == -1) {
direction = textPositions.get(i).getDir();
}
if (!textPositionSequences.isEmpty()) {
previous = textPositionSequences.get(textPositionSequences.size() - 1)
.getTextPositions()
@ -250,6 +255,13 @@ public class PDFLinesTextStripper extends PDFTextStripper {
continue;
}
if (textPositions.get(i).getDir() != direction && startIndex != i) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
startIndex = i;
direction = textPositions.get(i).getDir();
}
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) {
List<TextPosition> sublist = textPositions.subList(startIndex, i);
@ -329,6 +341,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
}
@Override
public String getText(PDDocument doc) throws IOException {

View File

@ -26,7 +26,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String fileName = "files/new/RotateTestFile.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();