RED-7074: Design Subsection section tree structure algorithm

* Extensive refactoring of the text-block splitting logic, which resulted in some empty blocks being created that could then not be localized (i.e. by containsBlock)
This commit is contained in:
maverickstuder 2024-05-08 14:15:27 +02:00
parent a9338262c5
commit cfb6f0acfa
3 changed files with 137 additions and 13 deletions

View File

@ -122,11 +122,14 @@ public class TableExtractionService {
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
if(cell.isEmpty() || textBlock.getSequences().isEmpty()) {
return false;
}
double x = textBlock.getPdfMinX();
double y = textBlock.getPdfMinY();
double w = textBlock.getPdfMaxX() - textBlock.getPdfMinX();
double h = textBlock.getPdfMaxY() - textBlock.getPdfMinY();
if (cell.isEmpty() || w <= 0 || h <= 0) {
if (w <= 0 || h <= 0) {
return false;
}
double x0 = cell.getX();

View File

@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.factory.Sea
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.Data;
import lombok.NonNull;
@Service
public class BlockificationPostprocessingService {
@ -119,7 +120,7 @@ public class BlockificationPostprocessingService {
double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates));
if(minDistance == Double.MAX_VALUE) {
if (minDistance == Double.MAX_VALUE) {
return;
}
if (minDistance == distanceToDirectMatch) {
@ -140,9 +141,6 @@ public class BlockificationPostprocessingService {
List<TextPageBlock> otherBlocks = new ArrayList<>();
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), text);
List<TextPositionSequence> postSequence = blockToSplit.getSequences();
postSequence.removeAll(wordSequenceResult.inSequence);
postSequence.removeAll(wordSequenceResult.preSequence);
blockToSplit.setSequences(wordSequenceResult.inSequence);
@ -152,8 +150,8 @@ public class BlockificationPostprocessingService {
otherBlocks.add(block);
blockToSplitIdx++;
}
if (!postSequence.isEmpty()) {
TextPageBlock block = buildTextBlock(postSequence, 0);
if (!wordSequenceResult.postSequence.isEmpty()) {
TextPageBlock block = buildTextBlock(wordSequenceResult.postSequence, 0);
classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block);
otherBlocks.add(block);
}
@ -161,8 +159,129 @@ public class BlockificationPostprocessingService {
}
/**
 * Outcome of partitioning a list of text position sequences around a searched text.
 * <p>
 * {@code preSequence} holds the sequences in front of the match, {@code inSequence} the sequences
 * forming the match, and {@code postSequence} the sequences behind it. All three lists are empty
 * when the result was created through the no-arg constructor.
 */
public static class WordSequenceResult {

    public List<TextPositionSequence> inSequence;
    public List<TextPositionSequence> preSequence;
    public List<TextPositionSequence> postSequence;


    /**
     * Creates a result with three fresh, empty, mutable lists.
     */
    public WordSequenceResult() {

        this(new ArrayList<>(), new ArrayList<>(), new ArrayList<>());
    }


    /**
     * Creates a result that stores the given lists as-is (no copies are made).
     *
     * @param inSequence   sequences that make up the match
     * @param preSequence  sequences located before the match
     * @param postSequence sequences located after the match
     */
    public WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence, List<TextPositionSequence> postSequence) {

        this.inSequence = inSequence;
        this.preSequence = preSequence;
        this.postSequence = postSequence;
    }

}
/**
 * Searches {@code textPositionSequences} for a contiguous run whose concatenated, sanitized text
 * equals the sanitized {@code text}, and partitions the input around that run.
 * <p>
 * Sequences are accumulated one by one. As soon as the accumulated text is at least as long as the
 * target it is checked twice:
 * <ul>
 *     <li>if it <em>ends</em> with the target, leading sequences (and, if necessary, the leading
 *     part of the first remaining sequence) are moved to the pre-list;</li>
 *     <li>otherwise, if it <em>starts</em> with the target, the overshooting tail of the most
 *     recently added sequence is cut off and moved to the post-list.</li>
 * </ul>
 * Once the accumulated text equals the target, all sequences that were not visited yet are appended
 * to the post-list and the result is returned.
 * <p>
 * NOTE(review): the partial splits assume that one character of the sanitized text corresponds to
 * one entry of {@code getTextPositions()} - confirm against {@code sanitizeString}.
 *
 * @param textPositionSequences the sequences of the block to search, in reading order
 * @param text                  the text to locate; it is sanitized before comparison
 * @return the pre/in/post partition, or a result with three empty lists when the text was not
 * found or the sanitized target is empty
 */
private static WordSequenceResult findWordSequence(List<TextPositionSequence> textPositionSequences, String text) {

    String target = sanitizeString(text);
    if (target.isEmpty()) {
        // Every string ends with "", so the prefix-trimming loop below would drain inSequence
        // completely and then read past its end. Treat an empty target as "not found" instead.
        return new WordSequenceResult();
    }

    List<TextPositionSequence> inSequence = new ArrayList<>();
    List<TextPositionSequence> preSequence = new ArrayList<>();
    List<TextPositionSequence> postSequence = new ArrayList<>();
    StringBuilder currentSequence = new StringBuilder();

    // Iterate by position so the remainder can be sliced off by index; looking the element up with
    // indexOf() would pick the wrong slot whenever an earlier sequence is equal to the current one.
    for (int sequenceIdx = 0; sequenceIdx < textPositionSequences.size(); sequenceIdx++) {
        TextPositionSequence sequence = textPositionSequences.get(sequenceIdx);
        currentSequence.append(sanitizeString(sequence.toString()));
        inSequence.add(sequence);

        if (currentSequence.length() < target.length()) {
            continue;
        }

        if (currentSequence.toString().endsWith(target)) {
            int index = 0;
            // Everything in front of the target has to move to the pre-list.
            String toRemove = currentSequence.substring(0, currentSequence.length() - target.length());
            TextPositionSequence next = inSequence.get(index);
            // Move whole sequences as long as the target still fits into what remains.
            while (currentSequence.length() - next.length() >= target.length()) {
                TextPositionSequence removed = inSequence.remove(index);
                currentSequence.delete(0, removed.toString().length());
                preSequence.add(removed);
                next = inSequence.get(index);
                toRemove = toRemove.substring(removed.length());
            }
            // A leftover prefix lies inside the first remaining sequence: split that sequence.
            if (!toRemove.isEmpty()) {
                SplitSequenceResult splitSequenceResult = splitSequence(inSequence.remove(index), toRemove);
                currentSequence.delete(0, splitSequenceResult.out.length());
                preSequence.add(splitSequenceResult.out);
                inSequence.add(index, splitSequenceResult.in);
            }
        } else if (currentSequence.toString().startsWith(target)) {
            // The overshoot is a tail of the sequence added last. Cut it off by position: searching
            // for the overshoot text would hit its FIRST occurrence, which is the wrong place when
            // the same characters also appear earlier in that sequence (e.g. "ana" in "banana").
            TextPositionSequence last = inSequence.remove(inSequence.size() - 1);
            int positionCount = last.getTextPositions().size();
            int overshoot = currentSequence.length() - target.length();
            int cut = positionCount - overshoot;
            inSequence.add(createSubSequence(last, 0, cut));
            postSequence.add(createSubSequence(last, cut, positionCount));
            // currentSequence starts with target, so truncating leaves exactly the target.
            currentSequence.setLength(target.length());
        }

        if (currentSequence.toString().equals(target)) {
            postSequence.addAll(textPositionSequences.subList(sequenceIdx + 1, textPositionSequences.size()));
            return new WordSequenceResult(inSequence, preSequence, postSequence);
        }
    }
    return new WordSequenceResult();
}
/**
 * Cuts the first occurrence of {@code toRemove} out of {@code sequence}.
 * <p>
 * The position of {@code toRemove} is looked up in {@code sequence.toString()} and used directly as
 * an index into {@code sequence.getTextPositions()}. The removed part becomes {@code out}. The kept
 * part {@code in} is the text before the occurrence if there is any, otherwise the text after it,
 * and {@code null} if the occurrence covers the whole sequence. When the occurrence lies strictly
 * inside the sequence, only the leading part is kept.
 *
 * @param sequence the sequence to split
 * @param toRemove the text to cut out of the sequence
 * @return the kept part ({@code in}, possibly {@code null}) and the removed part ({@code out})
 */
private static SplitSequenceResult splitSequence(TextPositionSequence sequence, String toRemove) {

    int removeStart = sequence.toString().indexOf(toRemove);
    int removeEnd = removeStart + toRemove.length();
    TextPositionSequence removedPart = createSubSequence(sequence, removeStart, removeEnd);

    // Text in front of the occurrence takes precedence as the kept part.
    if (removeStart > 0) {
        return new SplitSequenceResult(createSubSequence(sequence, 0, removeStart), removedPart);
    }

    // Otherwise keep whatever follows the occurrence, if anything.
    int positionCount = sequence.getTextPositions().size();
    if (removeEnd < positionCount) {
        return new SplitSequenceResult(createSubSequence(sequence, removeEnd, positionCount), removedPart);
    }

    // The occurrence spans the entire sequence: nothing is kept.
    return new SplitSequenceResult(null, removedPart);
}
/**
 * The two halves produced by splitting a single text position sequence.
 *
 * @param in  the part that is kept; {@code splitSequence} passes {@code null} here when nothing remains
 * @param out the part that was cut off
 */
public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) {}
/**
 * Builds a new sequence from the text positions {@code [start, end)} of {@code sequence}.
 * <p>
 * The selected positions are copied into a fresh list; the page and the paragraph-start flag are
 * taken over from the source sequence.
 *
 * @param sequence the sequence to take the positions from
 * @param start    index of the first text position to include
 * @param end      index after the last text position to include
 * @return the newly created sub-sequence
 */
private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) {

    var selectedPositions = sequence.getTextPositions().subList(start, end);
    TextPositionSequence subSequence = new TextPositionSequence(new ArrayList<>(selectedPositions), sequence.getPage());
    subSequence.setParagraphStart(sequence.isParagraphStart());
    return subSequence;
}
private static WordSequenceResultOld findWordSequenceOld(List<TextPositionSequence> textPositionSequences, String text) {
String target = sanitizeString(text);
List<TextPositionSequence> inSequence = new ArrayList<>();
List<TextPositionSequence> preSequence = new ArrayList<>();
@ -186,10 +305,10 @@ public class BlockificationPostprocessingService {
}
if (currentSequence.toString().equals(target)) {
return new WordSequenceResult(inSequence, preSequence);
return new WordSequenceResultOld(inSequence, preSequence);
}
}
return new WordSequenceResult(new ArrayList<>(), new ArrayList<>());
return new WordSequenceResultOld(new ArrayList<>(), new ArrayList<>());
}
@ -258,7 +377,6 @@ public class BlockificationPostprocessingService {
}
// currently only three cases are handled here:
// 1. equality
// 2. outline title contains block text
@ -301,7 +419,7 @@ public class BlockificationPostprocessingService {
}
private record WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence) {
private record WordSequenceResultOld(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence) {
}

View File

@ -83,7 +83,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
public void testViewerDocument() {
String fileName = "files/documine/20_TiltPlus_SensibilizacaoCutanea.pdf";
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
//String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf";
//String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf";
//String fileName = "files/documine/VV-547523_LLNA.pdf";
//String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
//String fileName = "files/syngenta_190_deduplicated/1 Abamectin_prr.pdf";
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
@ -92,7 +95,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
//String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
//String fileName = "files/new/mistitled_outlines_example.pdf";
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
//String fileName = "files/new/UTT-Books-53.pdf";
String fileName = "files/new/UTT-Books-53.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();