RED-7141: Fixed problem with different text directions
This commit is contained in:
parent
385d4b399e
commit
32c877e8f7
@ -316,7 +316,7 @@ public class LayoutParsingPipeline {
|
||||
log.info("Building Sections for {}", identifier);
|
||||
|
||||
if (layoutParsingType == DOCSTRUM || layoutParsingType == DOCSTRUM_XY) {
|
||||
// Currently for debugging return paragraphs as sections, becaus there is a merging logic in sectionBuilder
|
||||
// Currently for debugging return paragraphs as sections, because there is a merging logic in sectionBuilder
|
||||
List<ClassificationSection> sections = new ArrayList<>();
|
||||
for (var page : classificationPages) {
|
||||
page.getTextBlocks().forEach(block -> {
|
||||
|
||||
@ -102,6 +102,12 @@ public abstract class AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(AbstractPageBlock apb) {
|
||||
|
||||
return this.minY < apb.getMaxY() && this.maxY >= apb.getMinY() && this.minX < apb.getMaxX() && this.maxX > apb.getMinX();
|
||||
}
|
||||
|
||||
|
||||
public abstract boolean isEmpty();
|
||||
|
||||
}
|
||||
|
||||
@ -45,6 +45,9 @@ public class RedTextPosition {
|
||||
@JsonIgnore
|
||||
private String fontName;
|
||||
|
||||
@JsonIgnore
|
||||
TextPosition textPosition;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static RedTextPosition fromTextPosition(TextPosition textPosition) {
|
||||
@ -63,6 +66,7 @@ public class RedTextPosition {
|
||||
position[3] = textPosition.getHeightDir();
|
||||
|
||||
pos.setPosition(position);
|
||||
pos.textPosition = textPosition;
|
||||
return pos;
|
||||
}
|
||||
|
||||
|
||||
@ -19,6 +19,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFre
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
|
||||
|
||||
@ -48,6 +49,7 @@ public class DocstrumBlockificationService {
|
||||
});
|
||||
|
||||
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulingLines, verticalRulingLines));
|
||||
// abstractPageBlocks.add(buildTextBlock(textPositionSequences, 0));
|
||||
});
|
||||
|
||||
return new ClassificationPage(abstractPageBlocks);
|
||||
@ -56,6 +58,8 @@ public class DocstrumBlockificationService {
|
||||
|
||||
public void combineBlocks(ClassificationPage page) {
|
||||
|
||||
mergeZones(page.getTextBlocks());
|
||||
|
||||
TextPageBlock previous = new TextPageBlock();
|
||||
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
||||
while (itty.hasNext()) {
|
||||
@ -65,9 +69,9 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
TextPageBlock current = (TextPageBlock) block;
|
||||
|
||||
if (previous != null) {
|
||||
if (previous != null && !previous.getSequences().isEmpty()) {
|
||||
|
||||
if ((Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 /* && current.getNumberOfLines() <= 10 */ && previous.getNumberOfLines() <= current.getNumberOfLines())) {
|
||||
if (current.getDir() == previous.getDir() && (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 /* && current.getNumberOfLines() <= 10 */ && previous.getNumberOfLines() <= current.getNumberOfLines())) {
|
||||
previous.getSequences().addAll(current.getSequences());
|
||||
previous = buildTextBlock(previous.getSequences(), 0);
|
||||
itty.remove();
|
||||
@ -78,16 +82,17 @@ public class DocstrumBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 /* && current.getNumberOfLines() <= 10 */ || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1)) {
|
||||
if (current.getDir() == previous.getDir() && (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 /* && current.getNumberOfLines() <= 10 */ || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1)) {
|
||||
previous.getSequences().addAll(current.getSequences());
|
||||
previous = buildTextBlock(previous.getSequences(), 0);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (previous.containsBlock(current, THRESHOLD)) {
|
||||
if (current.getDir() == previous.getDir() && previous.containsBlock(current, THRESHOLD)) {
|
||||
previous.getSequences().addAll(current.getSequences());
|
||||
previous = buildTextBlock(previous.getSequences().stream().sorted(new TextPositionSequenceComparator()).collect(Collectors.toList()), 0);
|
||||
QuickSort.sort(previous.getSequences(), new TextPositionSequenceComparator());
|
||||
previous = buildTextBlock(previous.getSequences(), 0);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
@ -98,6 +103,40 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private void mergeZones(List<AbstractPageBlock> zones) {
|
||||
|
||||
ListIterator<AbstractPageBlock> itty = zones.listIterator();
|
||||
|
||||
while (itty.hasNext()) {
|
||||
AbstractPageBlock block = itty.next();
|
||||
if (block instanceof TablePageBlock) {
|
||||
continue;
|
||||
}
|
||||
|
||||
TextPageBlock current = (TextPageBlock) block;
|
||||
|
||||
List<AbstractPageBlock> toBeRemoved = new ArrayList<>();
|
||||
for (AbstractPageBlock innerZone : zones) {
|
||||
if (innerZone == current) {
|
||||
continue;
|
||||
}
|
||||
if (innerZone instanceof TablePageBlock) {
|
||||
continue;
|
||||
}
|
||||
|
||||
TextPageBlock inner = (TextPageBlock) innerZone;
|
||||
|
||||
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, 0, 0)) {
|
||||
|
||||
current.getSequences().addAll(inner.getSequences());
|
||||
current = buildTextBlock(inner.getSequences().stream().sorted(new TextPositionSequenceComparator()).collect(Collectors.toList()), 0);
|
||||
}
|
||||
}
|
||||
zones.removeAll(toBeRemoved);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public List<AbstractPageBlock> splitZonesAtRulings(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
int indexOnPage = 0;
|
||||
|
||||
@ -1,10 +1,12 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone;
|
||||
@ -29,20 +31,29 @@ public class DocstrumSegmentationService {
|
||||
|
||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOder) {
|
||||
|
||||
var positions = textPositions.stream().map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
|
||||
List<Zone> zones = new ArrayList<>();
|
||||
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
|
||||
zones.addAll(computeZones(textPositions, TextDirection.QUARTER_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
|
||||
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
|
||||
|
||||
return readingOrderService.resolve(zones, xyOder);
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> computeZones(List<TextPositionSequence> textPositions, TextDirection direction) {
|
||||
|
||||
var positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
|
||||
|
||||
var characters = positions.stream().map(Character::new).collect(Collectors.toList());
|
||||
|
||||
nearestNeighbourService.findNearestNeighbors(characters);
|
||||
|
||||
var characterSpacing = spacingService.computeCharacterSpacing(characters);
|
||||
var lineSpacing = spacingService.computeLineSpacing(characters);
|
||||
var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
|
||||
|
||||
var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
|
||||
|
||||
var zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
|
||||
|
||||
return readingOrderService.resolve(zones, xyOder);
|
||||
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -31,7 +31,8 @@ public class LineBuilderService {
|
||||
character.getNeighbors().forEach(neighbor -> {
|
||||
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||
double y = neighbor.getVerticalDistance() / maxVerticalDistance;
|
||||
if (filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, 2) <= 1) {
|
||||
if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() && filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y,
|
||||
2) <= 1) {
|
||||
sets.union(character, neighbor.getCharacter());
|
||||
}
|
||||
});
|
||||
@ -40,6 +41,7 @@ public class LineBuilderService {
|
||||
List<Line> lines = new ArrayList<>();
|
||||
sets.forEach(group -> {
|
||||
List<Character> lineCharacters = new ArrayList<>(group);
|
||||
// QuickSort.sort(lineCharacters, new CharacterComparator());
|
||||
lineCharacters.sort(Comparator.comparingDouble(Character::getX));
|
||||
lines.add(new Line(lineCharacters, characterSpacing));
|
||||
});
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.s
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.Set;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
@ -64,17 +65,46 @@ public class ZoneBuilderService {
|
||||
|
||||
List<Zone> zones = new ArrayList<>();
|
||||
sets.forEach(group -> {
|
||||
zones.add(mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing));
|
||||
zones.add(new Zone(new ArrayList<>(group)));
|
||||
});
|
||||
|
||||
if (zones.size() > MAX_ZONES) {
|
||||
// List<Zone> mergedZones = mergeZones(zones);
|
||||
|
||||
List<Zone> finalZones = zones;
|
||||
|
||||
if (finalZones.size() > MAX_ZONES) {
|
||||
List<Line> oneZoneLines = new ArrayList<>();
|
||||
for (Zone zone : zones) {
|
||||
for (Zone zone : finalZones) {
|
||||
oneZoneLines.addAll(zone.getLines());
|
||||
}
|
||||
return List.of(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing));
|
||||
}
|
||||
|
||||
return finalZones;
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> mergeZones(List<Zone> zones) {
|
||||
|
||||
ListIterator<Zone> itty = zones.listIterator();
|
||||
|
||||
while (itty.hasNext()) {
|
||||
|
||||
Zone current = itty.next();
|
||||
|
||||
for (Zone inner : zones) {
|
||||
if (inner == current) {
|
||||
continue;
|
||||
}
|
||||
if (current.getBBox().intersects(inner.getBBox())) {
|
||||
inner.getLines().addAll(current.getLines());
|
||||
inner.buildBBox();
|
||||
itty.remove();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return zones;
|
||||
}
|
||||
|
||||
@ -93,6 +123,16 @@ public class ZoneBuilderService {
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> mergeLinesInZones(List<Zone> zones, double characterSpacing, double lineSpacing) {
|
||||
|
||||
List<Zone> merged = new ArrayList<>();
|
||||
for (Zone zone : zones) {
|
||||
merged.add(mergeLinesInZone(zone.getLines(), characterSpacing, lineSpacing));
|
||||
}
|
||||
return merged;
|
||||
}
|
||||
|
||||
|
||||
private Zone mergeLinesInZone(List<Line> lines, double characterSpacing, double lineSpacing) {
|
||||
|
||||
double maxHorizontalDistance = 0;
|
||||
|
||||
@ -0,0 +1,40 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character;
|
||||
|
||||
public class CharacterComparator implements Comparator<Character> {
|
||||
|
||||
@Override
|
||||
public int compare(Character pos1, Character pos2) {
|
||||
// only compare text that is in the same direction
|
||||
int cmp1 = Float.compare(pos1.getTextPosition().getDir(), pos2.getTextPosition().getDir());
|
||||
if (cmp1 != 0) {
|
||||
return cmp1;
|
||||
}
|
||||
|
||||
// get the text direction adjusted coordinates
|
||||
float x1 = pos1.getTextPosition().getXDirAdj();
|
||||
float x2 = pos2.getTextPosition().getXDirAdj();
|
||||
|
||||
float pos1YBottom = pos1.getTextPosition().getYDirAdj();
|
||||
float pos2YBottom = pos2.getTextPosition().getYDirAdj();
|
||||
|
||||
// note that the coordinates have been adjusted so 0,0 is in upper left
|
||||
float pos1YTop = pos1YBottom - pos1.getTextPosition().getHeightDir();
|
||||
float pos2YTop = pos2YBottom - pos2.getTextPosition().getHeightDir();
|
||||
|
||||
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
||||
|
||||
// we will do a simple tolerance comparison
|
||||
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
|
||||
return Float.compare(x1, x2);
|
||||
} else if (pos1YBottom < pos2YBottom) {
|
||||
return -1;
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -237,8 +237,13 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
int startIndex = 0;
|
||||
RedTextPosition previous = null;
|
||||
|
||||
float direction = -1;
|
||||
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
||||
|
||||
if (direction == -1) {
|
||||
direction = textPositions.get(i).getDir();
|
||||
}
|
||||
|
||||
if (!textPositionSequences.isEmpty()) {
|
||||
previous = textPositionSequences.get(textPositionSequences.size() - 1)
|
||||
.getTextPositions()
|
||||
@ -250,6 +255,13 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (textPositions.get(i).getDir() != direction && startIndex != i) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
||||
startIndex = i;
|
||||
direction = textPositions.get(i).getDir();
|
||||
}
|
||||
|
||||
// Strange but sometimes this is happening, for example: Metolachlor2.pdf
|
||||
if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) {
|
||||
List<TextPosition> sublist = textPositions.subList(startIndex, i);
|
||||
@ -329,6 +341,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String getText(PDDocument doc) throws IOException {
|
||||
|
||||
|
||||
@ -26,7 +26,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String fileName = "files/new/RotateTestFile.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user