diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java index 4af2a04..b538ec0 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java @@ -122,11 +122,14 @@ public class TableExtractionService { private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) { + if(cell.isEmpty() || textBlock.getSequences().isEmpty()) { + return false; + } double x = textBlock.getPdfMinX(); double y = textBlock.getPdfMinY(); double w = textBlock.getPdfMaxX() - textBlock.getPdfMinX(); double h = textBlock.getPdfMaxY() - textBlock.getPdfMinY(); - if (cell.isEmpty() || w <= 0 || h <= 0) { + if (w <= 0 || h <= 0) { return false; } double x0 = cell.getX(); diff --git a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java index 1930ce7..f7a9f3b 100644 --- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java +++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/BlockificationPostprocessingService.java @@ 
-20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.factory.Sea import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations; import lombok.Data; +import lombok.NonNull; @Service public class BlockificationPostprocessingService { @@ -119,7 +120,7 @@ public class BlockificationPostprocessingService { double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates)); - if(minDistance == Double.MAX_VALUE) { + if (minDistance == Double.MAX_VALUE) { return; } if (minDistance == distanceToDirectMatch) { @@ -140,9 +141,6 @@ public class BlockificationPostprocessingService { List otherBlocks = new ArrayList<>(); int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit); WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), text); - List postSequence = blockToSplit.getSequences(); - postSequence.removeAll(wordSequenceResult.inSequence); - postSequence.removeAll(wordSequenceResult.preSequence); blockToSplit.setSequences(wordSequenceResult.inSequence); @@ -152,8 +150,8 @@ public class BlockificationPostprocessingService { otherBlocks.add(block); blockToSplitIdx++; } - if (!postSequence.isEmpty()) { - TextPageBlock block = buildTextBlock(postSequence, 0); + if (!wordSequenceResult.postSequence.isEmpty()) { + TextPageBlock block = buildTextBlock(wordSequenceResult.postSequence, 0); classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block); otherBlocks.add(block); } @@ -161,8 +159,185 @@ public class BlockificationPostprocessingService { } + public static class WordSequenceResult { + + public List inSequence; + public List preSequence; + public List postSequence; + + + public WordSequenceResult(List inSequence, List preSequence, List postSequence) { + + this.inSequence = inSequence; + this.preSequence = preSequence; + this.postSequence = postSequence; + } + + + public WordSequenceResult() { + +
this.inSequence = new ArrayList<>(); + this.preSequence = new ArrayList<>(); + this.postSequence = new ArrayList<>(); + } + + } + +
+    /**
+     * Finds the run of consecutive sequences whose sanitized, concatenated text equals the sanitized {@code text}.
+     * <p>
+     * Sequences are accumulated in list order. Once the accumulated text ends with the target, the surplus at the
+     * front is moved to {@code preSequence}; once it starts with the target, the surplus at the back is moved to
+     * {@code postSequence}. A sequence straddling the boundary is split so that only the matching text positions
+     * stay in {@code inSequence}. All sequences after the match are appended to {@code postSequence}.
+     *
+     * @param textPositionSequences sequences to search, matched in list order
+     * @param text                  text to find; both sides are passed through {@code sanitizeString} before comparing
+     * @return the match plus the sequences before and after it, or three empty lists when nothing matches
+     */
     private static WordSequenceResult findWordSequence(List textPositionSequences, String text) {
 
+        String target = sanitizeString(text);
+        if (target.isEmpty()) {
+            // An empty target would trivially match every prefix; report it as "no match" instead.
+            return new WordSequenceResult();
+        }
+
+        List<TextPositionSequence> inSequence = new ArrayList<>();
+        List<TextPositionSequence> preSequence = new ArrayList<>();
+        List<TextPositionSequence> postSequence = new ArrayList<>();
+        StringBuilder currentSequence = new StringBuilder();
+
+        // Iterate by index so the tail after the match can be taken without an equals-based lookup.
+        for (int i = 0; i < textPositionSequences.size(); i++) {
+
+            TextPositionSequence sequence = textPositionSequences.get(i);
+            currentSequence.append(sanitizeString(sequence.toString()));
+            inSequence.add(sequence);
+
+            if (currentSequence.length() < target.length()) {
+                continue;
+            }
+
+            String current = currentSequence.toString();
+            if (current.endsWith(target)) {
+                moveLeadingSurplus(inSequence, preSequence, currentSequence, target.length());
+            } else if (current.startsWith(target)) {
+                moveTrailingSurplus(inSequence, postSequence, currentSequence, target.length());
+            }
+
+            if (currentSequence.toString().equals(target)) {
+                postSequence.addAll(textPositionSequences.subList(i + 1, textPositionSequences.size()));
+                return new WordSequenceResult(inSequence, preSequence, postSequence);
+            }
+        }
+
+        return new WordSequenceResult();
+    }
+
+
+    /**
+     * Moves everything in front of the trailing {@code targetLength} characters from {@code inSequence} to
+     * {@code preSequence}: whole sequences first, then the head of the sequence that straddles the boundary.
+     */
+    private static void moveLeadingSurplus(List<TextPositionSequence> inSequence,
+                                           List<TextPositionSequence> preSequence,
+                                           StringBuilder currentSequence,
+                                           int targetLength) {
+
+        int surplus = currentSequence.length() - targetLength;
+        while (!inSequence.isEmpty()) {
+            // Measure in sanitized characters, the unit currentSequence was built from.
+            int length = sanitizeString(inSequence.get(0).toString()).length();
+            if (length > surplus) {
+                break;
+            }
+            preSequence.add(inSequence.remove(0));
+            currentSequence.delete(0, length);
+            surplus -= length;
+        }
+
+        if (surplus > 0) {
+            SplitSequenceResult split = splitSequence(inSequence.remove(0), surplus, true);
+            currentSequence.delete(0, surplus);
+            preSequence.add(split.out());
+            inSequence.add(0, split.in());
+        }
+    }
+
+
+    /**
+     * Moves everything behind the leading {@code targetLength} characters from the last element of
+     * {@code inSequence} to {@code postSequence} by splitting that element.
+     */
+    private static void moveTrailingSurplus(List<TextPositionSequence> inSequence,
+                                            List<TextPositionSequence> postSequence,
+                                            StringBuilder currentSequence,
+                                            int targetLength) {
+
+        int surplus = currentSequence.length() - targetLength;
+        SplitSequenceResult split = splitSequence(inSequence.remove(inSequence.size() - 1), surplus, false);
+        currentSequence.setLength(targetLength);
+        inSequence.add(split.in());
+        postSequence.add(split.out());
+    }
+
+
+    /**
+     * Splits {@code sequence} into a kept part and a removed part of {@code removeCount} text positions.
+     * <p>
+     * The cut is computed by position instead of searching the sequence text for the removed fragment: a search
+     * returns the first occurrence (wrong for a suffix that also occurs earlier in the word) and fails outright
+     * when the sanitized fragment differs from the raw text.
+     * NOTE(review): assumes sanitizeString yields one character per text position - confirm against its implementation.
+     *
+     * @param sequence        the sequence to split
+     * @param removeCount     number of text positions to cut off; must leave at least one position on each side
+     * @param removeFromStart {@code true} to cut from the front, {@code false} to cut from the back
+     * @return the kept part as {@code in} and the removed part as {@code out}, neither of them {@code null}
+     * @throws IllegalArgumentException if {@code removeCount} would leave one side empty
+     */
+    private static SplitSequenceResult splitSequence(TextPositionSequence sequence, int removeCount, boolean removeFromStart) {
+
+        int size = sequence.getTextPositions().size();
+        if (removeCount <= 0 || removeCount >= size) {
+            throw new IllegalArgumentException("Cannot split off " + removeCount + " of " + size + " text positions");
+        }
+
+        if (removeFromStart) {
+            return new SplitSequenceResult(createSubSequence(sequence, removeCount, size), createSubSequence(sequence, 0, removeCount));
+        }
+        int cut = size - removeCount;
+        return new SplitSequenceResult(createSubSequence(sequence, 0, cut), createSubSequence(sequence, cut, size));
+    }
+
+
+    /**
+     * The two halves of a split sequence.
+     *
+     * @param in  the part that stays inside the match
+     * @param out the part that is moved out of the match
+     */
+    public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) {
+
+    }
+
+
+    /**
+     * Creates a new sequence from the text positions {@code [start, end)} of {@code sequence}, keeping its page
+     * and paragraph-start flag.
+     */
+    private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) {
+
+        TextPositionSequence newSeq = new TextPositionSequence(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage());
+        newSeq.setParagraphStart(sequence.isParagraphStart());
+        return newSeq;
+    }
+
+
+ private static WordSequenceResultOld findWordSequenceOld(List textPositionSequences, String text) { + String target = sanitizeString(text); List inSequence = new ArrayList<>(); List preSequence = new ArrayList<>(); @@ -186,10 +361,10 @@ public class BlockificationPostprocessingService { } if (currentSequence.toString().equals(target)) { - return new WordSequenceResult(inSequence, preSequence); + return new WordSequenceResultOld(inSequence, preSequence); } } - return new WordSequenceResult(new ArrayList<>(), new ArrayList<>()); + return new WordSequenceResultOld(new ArrayList<>(), new ArrayList<>()); } @@ -258,7 +433,6
@@ public class BlockificationPostprocessingService { } - // currently only three cases are handled here: // 1. equality // 2. outline title contains block text @@ -301,7 +475,7 @@ public class BlockificationPostprocessingService { } - private record WordSequenceResult(List inSequence, List preSequence) { + private record WordSequenceResultOld(List inSequence, List preSequence) { } diff --git a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java index 5e5028a..4a590c6 100644 --- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java +++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java @@ -83,7 +83,10 @@ public class ViewerDocumentTest extends BuildDocumentTest { public void testViewerDocument() { - String fileName = "files/documine/20_TiltPlus_SensibilizacaoCutanea.pdf"; + //String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf"; + //String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf"; + //String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf"; + //String fileName = "files/documine/VV-547523_LLNA.pdf"; //String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; //String fileName = "files/syngenta_190_deduplicated/1 Abamectin_prr.pdf"; //String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf"; @@ -92,7 +95,7 @@ public class ViewerDocumentTest extends BuildDocumentTest { //String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf"; //String fileName = 
"files/new/mistitled_outlines_example.pdf"; //String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf"; - //String fileName = "files/new/UTT-Books-53.pdf"; + String fileName = "files/new/UTT-Books-53.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; var documentFile = new ClassPathResource(fileName).getFile();