diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java index 3344f7b..c34a337 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java @@ -316,7 +316,7 @@ public class LayoutParsingPipeline { log.info("Building Sections for {}", identifier); if (layoutParsingType == DOCSTRUM || layoutParsingType == DOCSTRUM_XY) { - // Currently for debugging return paragraphs as sections, becaus there is a merging logic in sectionBuilder + // Currently for debugging return paragraphs as sections, because there is a merging logic in sectionBuilder List sections = new ArrayList<>(); for (var page : classificationPages) { page.getTextBlocks().forEach(block -> { diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java index d4f70fe..92d6989 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/AbstractPageBlock.java @@ -102,6 +102,12 @@ public abstract class AbstractPageBlock { } + public boolean intersects(AbstractPageBlock apb) { + + return this.minY < apb.getMaxY() && this.maxY >= apb.getMinY() && this.minX < apb.getMaxX() && this.maxX > apb.getMinX(); + } + + public abstract boolean isEmpty(); } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java index ccea113..9e9cc97 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/RedTextPosition.java @@ -45,6 +45,9 @@ public class RedTextPosition { @JsonIgnore private String fontName; + @JsonIgnore + TextPosition textPosition; + @SneakyThrows public static RedTextPosition fromTextPosition(TextPosition textPosition) { @@ -63,6 +66,7 @@ public class RedTextPosition { position[3] = textPosition.getHeightDir(); pos.setPosition(position); + pos.textPosition = textPosition; return pos; } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java index c0f61fd..8385a60 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java @@ -19,6 +19,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFre import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.DocstrumSegmentationService; +import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort; import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator; @@ -48,6 +49,7 @@ public class DocstrumBlockificationService { }); abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulingLines, verticalRulingLines)); +// abstractPageBlocks.add(buildTextBlock(textPositionSequences, 0)); }); return new ClassificationPage(abstractPageBlocks); @@ -56,6 +58,8 @@ public class DocstrumBlockificationService { public void combineBlocks(ClassificationPage page) { + mergeZones(page.getTextBlocks()); + TextPageBlock previous = new TextPageBlock(); ListIterator itty = page.getTextBlocks().listIterator(); while (itty.hasNext()) { @@ -65,9 +69,9 @@ public class DocstrumBlockificationService { } TextPageBlock current = (TextPageBlock) block; - if (previous != null) { + if (previous != null && !previous.getSequences().isEmpty()) { - if ((Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 /* && current.getNumberOfLines() <= 10 */ && previous.getNumberOfLines() <= current.getNumberOfLines())) { + if (current.getDir() == previous.getDir() && (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 /* && current.getNumberOfLines() <= 10 */ && previous.getNumberOfLines() <= current.getNumberOfLines())) { previous.getSequences().addAll(current.getSequences()); previous = buildTextBlock(previous.getSequences(), 0); itty.remove(); @@ -78,16 +82,17 @@ public class DocstrumBlockificationService { continue; } - if ((Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 /* && current.getNumberOfLines() <= 10 */ || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1)) { + if (current.getDir() == previous.getDir() && (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 /* && current.getNumberOfLines() <= 10 */ || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1)) { previous.getSequences().addAll(current.getSequences()); previous = buildTextBlock(previous.getSequences(), 0); itty.remove(); continue; } - if (previous.containsBlock(current, THRESHOLD)) { + if (current.getDir() == previous.getDir() && previous.containsBlock(current, THRESHOLD)) { previous.getSequences().addAll(current.getSequences()); - previous = buildTextBlock(previous.getSequences().stream().sorted(new TextPositionSequenceComparator()).collect(Collectors.toList()), 0); + QuickSort.sort(previous.getSequences(), new TextPositionSequenceComparator()); + previous = buildTextBlock(previous.getSequences(), 0); itty.remove(); continue; } @@ -98,6 +103,40 @@ public class DocstrumBlockificationService { } + private void mergeZones(List zones) { + + ListIterator itty = zones.listIterator(); + + while (itty.hasNext()) { + AbstractPageBlock block = itty.next(); + if (block instanceof TablePageBlock) { + continue; + } + + TextPageBlock current = (TextPageBlock) block; + + List toBeRemoved = new ArrayList<>(); + for (AbstractPageBlock innerZone : zones) { + if (innerZone == current) { + continue; + } + if (innerZone instanceof TablePageBlock) { + continue; + } + + TextPageBlock inner = (TextPageBlock) innerZone; + + if (current.getDir() == inner.getDir() && current.almostIntersects(inner, 0, 0)) { + + current.getSequences().addAll(inner.getSequences()); + current = buildTextBlock(inner.getSequences().stream().sorted(new TextPositionSequenceComparator()).collect(Collectors.toList()), 0); + } + } + zones.removeAll(toBeRemoved); + } + } + + public List splitZonesAtRulings(List textPositions, List horizontalRulingLines, List verticalRulingLines) { int indexOnPage = 0; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java index fbf92da..5fa3e01 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/DocstrumSegmentationService.java @@ -1,10 +1,12 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum; +import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; import org.springframework.stereotype.Service; +import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character; import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Zone; @@ -29,20 +31,29 @@ public class DocstrumSegmentationService { public List segmentPage(List textPositions, boolean xyOder) { - var positions = textPositions.stream().map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList(); + List zones = new ArrayList<>(); + zones.addAll(computeZones(textPositions, TextDirection.ZERO)); + zones.addAll(computeZones(textPositions, TextDirection.QUARTER_CIRCLE)); + zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE)); + zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE)); + + return readingOrderService.resolve(zones, xyOder); + } + + + private List computeZones(List textPositions, TextDirection direction) { + + var positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList(); var characters = positions.stream().map(Character::new).collect(Collectors.toList()); nearestNeighbourService.findNearestNeighbors(characters); var characterSpacing = spacingService.computeCharacterSpacing(characters); - var lineSpacing = spacingService.computeLineSpacing(characters); + var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20); var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing); - - var zones = zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing); - - return readingOrderService.resolve(zones, xyOder); + return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing); } } diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/LineBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/LineBuilderService.java index 19ee66c..f4fef2f 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/LineBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/LineBuilderService.java @@ -31,7 +31,8 @@ public class LineBuilderService { character.getNeighbors().forEach(neighbor -> { double x = neighbor.getHorizontalDistance() / maxHorizontalDistance; double y = neighbor.getVerticalDistance() / maxVerticalDistance; - if (filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, 2) <= 1) { + if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() && filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y, + 2) <= 1) { sets.union(character, neighbor.getCharacter()); } }); @@ -40,6 +41,7 @@ public class LineBuilderService { List lines = new ArrayList<>(); sets.forEach(group -> { List lineCharacters = new ArrayList<>(group); +// QuickSort.sort(lineCharacters, new CharacterComparator()); lineCharacters.sort(Comparator.comparingDouble(Character::getX)); lines.add(new Line(lineCharacters, characterSpacing)); }); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java index 3ddc2d2..16e76d5 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/service/ZoneBuilderService.java @@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.s import java.util.ArrayList; import java.util.Comparator; import java.util.List; +import java.util.ListIterator; import java.util.Set; import org.springframework.stereotype.Service; @@ -64,17 +65,46 @@ public class ZoneBuilderService { List zones = new ArrayList<>(); sets.forEach(group -> { - zones.add(mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing)); + zones.add(new Zone(new ArrayList<>(group))); }); - if (zones.size() > MAX_ZONES) { +// List mergedZones = mergeZones(zones); + + List finalZones = zones; + + if (finalZones.size() > MAX_ZONES) { List oneZoneLines = new ArrayList<>(); - for (Zone zone : zones) { + for (Zone zone : finalZones) { oneZoneLines.addAll(zone.getLines()); } return List.of(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing)); } + return finalZones; + } + + + private List mergeZones(List zones) { + + ListIterator itty = zones.listIterator(); + + while (itty.hasNext()) { + + Zone current = itty.next(); + + for (Zone inner : zones) { + if (inner == current) { + continue; + } + if (current.getBBox().intersects(inner.getBBox())) { + inner.getLines().addAll(current.getLines()); + inner.buildBBox(); + itty.remove(); + break; + } + } + } + return zones; } @@ -93,6 +123,16 @@ public class ZoneBuilderService { } + private List mergeLinesInZones(List zones, double characterSpacing, double lineSpacing) { + + List merged = new ArrayList<>(); + for (Zone zone : zones) { + merged.add(mergeLinesInZone(zone.getLines(), characterSpacing, lineSpacing)); + } + return merged; + } + + private Zone mergeLinesInZone(List lines, double characterSpacing, double lineSpacing) { double maxHorizontalDistance = 0; diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/utils/CharacterComparator.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/utils/CharacterComparator.java new file mode 100644 index 0000000..b63688d --- /dev/null +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/docstrum/utils/CharacterComparator.java @@ -0,0 +1,40 @@ +package com.knecon.fforesight.service.layoutparser.processor.services.docstrum.utils; + +import java.util.Comparator; + +import com.knecon.fforesight.service.layoutparser.processor.services.docstrum.model.Character; + +public class CharacterComparator implements Comparator { + + @Override + public int compare(Character pos1, Character pos2) { + // only compare text that is in the same direction + int cmp1 = Float.compare(pos1.getTextPosition().getDir(), pos2.getTextPosition().getDir()); + if (cmp1 != 0) { + return cmp1; + } + + // get the text direction adjusted coordinates + float x1 = pos1.getTextPosition().getXDirAdj(); + float x2 = pos2.getTextPosition().getXDirAdj(); + + float pos1YBottom = pos1.getTextPosition().getYDirAdj(); + float pos2YBottom = pos2.getTextPosition().getYDirAdj(); + + // note that the coordinates have been adjusted so 0,0 is in upper left + float pos1YTop = pos1YBottom - pos1.getTextPosition().getHeightDir(); + float pos2YTop = pos2YBottom - pos2.getTextPosition().getHeightDir(); + + float yDifference = Math.abs(pos1YBottom - pos2YBottom); + + // we will do a simple tolerance comparison + if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) { + return Float.compare(x1, x2); + } else if (pos1YBottom < pos2YBottom) { + return -1; + } else { + return 1; + } + } + +} diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java index 09a8eb2..18e5a5a 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/parsing/PDFLinesTextStripper.java @@ -237,8 +237,13 @@ public class PDFLinesTextStripper extends PDFTextStripper { int startIndex = 0; RedTextPosition previous = null; + float direction = -1; for (int i = 0; i <= textPositions.size() - 1; i++) { + if (direction == -1) { + direction = textPositions.get(i).getDir(); + } + if (!textPositionSequences.isEmpty()) { previous = textPositionSequences.get(textPositionSequences.size() - 1) .getTextPositions() @@ -250,6 +255,13 @@ public class PDFLinesTextStripper extends PDFTextStripper { continue; } + if (textPositions.get(i).getDir() != direction && startIndex != i) { + List sublist = textPositions.subList(startIndex, i); + textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart)); + startIndex = i; + direction = textPositions.get(i).getDir(); + } + // Strange but sometimes this is happening, for example: Metolachlor2.pdf if (checkIfCurrentPositionIsToTheRightOfPreviousPosition(i, textPositions)) { List sublist = textPositions.subList(startIndex, i); @@ -329,6 +341,7 @@ public class PDFLinesTextStripper extends PDFTextStripper { .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize; } + @Override public String getText(PDDocument doc) throws IOException { diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 264e2da..022bcb9 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -26,7 +26,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { @SneakyThrows public void testViewerDocument() { - String fileName = "files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; + String fileName = "files/new/RotateTestFile.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile(); diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/174.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/174.pdf new file mode 100644 index 0000000..27e6c3b Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/174.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/218.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/218.pdf new file mode 100644 index 0000000..ba016fb Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/218.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/S10.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/S10.pdf new file mode 100644 index 0000000..d069abc Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/S10.pdf differ diff --git a/layoutparser-service/layoutparser-service-server/src/test/resources/files/S91.pdf b/layoutparser-service/layoutparser-service-server/src/test/resources/files/S91.pdf new file mode 100644 index 0000000..2c5b19c Binary files /dev/null and b/layoutparser-service/layoutparser-service-server/src/test/resources/files/S91.pdf differ