RED-7074: Design Subsection section tree structure algorithm
* Extensive refactoring of the splitting logic for text blocks, which resulted in some empty blocks being created that can then not be located (i.e. by containsBlock)
This commit is contained in:
parent
a9338262c5
commit
cfb6f0acfa
@ -122,11 +122,14 @@ public class TableExtractionService {
|
||||
|
||||
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
|
||||
|
||||
if(cell.isEmpty() || textBlock.getSequences().isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
double x = textBlock.getPdfMinX();
|
||||
double y = textBlock.getPdfMinY();
|
||||
double w = textBlock.getPdfMaxX() - textBlock.getPdfMinX();
|
||||
double h = textBlock.getPdfMaxY() - textBlock.getPdfMinY();
|
||||
if (cell.isEmpty() || w <= 0 || h <= 0) {
|
||||
if (w <= 0 || h <= 0) {
|
||||
return false;
|
||||
}
|
||||
double x0 = cell.getX();
|
||||
|
||||
@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.factory.Sea
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
|
||||
@Service
|
||||
public class BlockificationPostprocessingService {
|
||||
@ -119,7 +120,7 @@ public class BlockificationPostprocessingService {
|
||||
|
||||
double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates));
|
||||
|
||||
if(minDistance == Double.MAX_VALUE) {
|
||||
if (minDistance == Double.MAX_VALUE) {
|
||||
return;
|
||||
}
|
||||
if (minDistance == distanceToDirectMatch) {
|
||||
@ -140,9 +141,6 @@ public class BlockificationPostprocessingService {
|
||||
List<TextPageBlock> otherBlocks = new ArrayList<>();
|
||||
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
|
||||
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), text);
|
||||
List<TextPositionSequence> postSequence = blockToSplit.getSequences();
|
||||
postSequence.removeAll(wordSequenceResult.inSequence);
|
||||
postSequence.removeAll(wordSequenceResult.preSequence);
|
||||
|
||||
blockToSplit.setSequences(wordSequenceResult.inSequence);
|
||||
|
||||
@ -152,8 +150,8 @@ public class BlockificationPostprocessingService {
|
||||
otherBlocks.add(block);
|
||||
blockToSplitIdx++;
|
||||
}
|
||||
if (!postSequence.isEmpty()) {
|
||||
TextPageBlock block = buildTextBlock(postSequence, 0);
|
||||
if (!wordSequenceResult.postSequence.isEmpty()) {
|
||||
TextPageBlock block = buildTextBlock(wordSequenceResult.postSequence, 0);
|
||||
classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block);
|
||||
otherBlocks.add(block);
|
||||
}
|
||||
@ -161,8 +159,129 @@ public class BlockificationPostprocessingService {
|
||||
}
|
||||
|
||||
|
||||
public static class WordSequenceResult {
|
||||
|
||||
public List<TextPositionSequence> inSequence;
|
||||
public List<TextPositionSequence> preSequence;
|
||||
public List<TextPositionSequence> postSequence;
|
||||
|
||||
|
||||
public WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence, List<TextPositionSequence> postSequence) {
|
||||
|
||||
this.inSequence = inSequence;
|
||||
this.preSequence = preSequence;
|
||||
this.postSequence = postSequence;
|
||||
}
|
||||
|
||||
|
||||
public WordSequenceResult() {
|
||||
|
||||
this.inSequence = new ArrayList<>();
|
||||
this.preSequence = new ArrayList<>();
|
||||
this.postSequence = new ArrayList<>();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static WordSequenceResult findWordSequence(List<TextPositionSequence> textPositionSequences, String text) {
|
||||
|
||||
String target = sanitizeString(text);
|
||||
List<TextPositionSequence> inSequence = new ArrayList<>();
|
||||
List<TextPositionSequence> preSequence = new ArrayList<>();
|
||||
List<TextPositionSequence> postSequence = new ArrayList<>();
|
||||
StringBuilder currentSequence = new StringBuilder();
|
||||
|
||||
for (TextPositionSequence sequence : textPositionSequences) {
|
||||
|
||||
currentSequence.append(sanitizeString(sequence.toString()));
|
||||
inSequence.add(sequence);
|
||||
|
||||
if (currentSequence.length() >= target.length()) {
|
||||
|
||||
if (currentSequence.toString().endsWith(target)) {
|
||||
|
||||
int index = 0;
|
||||
String toRemove = currentSequence.substring(0, currentSequence.length() - target.length());
|
||||
|
||||
TextPositionSequence next = inSequence.get(index);
|
||||
while (currentSequence.length() - next.length() >= target.length()) {
|
||||
|
||||
TextPositionSequence removed = inSequence.remove(index);
|
||||
currentSequence.delete(0, removed.toString().length());
|
||||
preSequence.add(removed);
|
||||
|
||||
next = inSequence.get(index);
|
||||
toRemove = toRemove.substring(removed.length());
|
||||
}
|
||||
|
||||
if (!toRemove.isEmpty()) {
|
||||
SplitSequenceResult splitSequenceResult = splitSequence(inSequence.remove(index), toRemove);
|
||||
|
||||
currentSequence.delete(0, splitSequenceResult.out.length());
|
||||
preSequence.add(splitSequenceResult.out);
|
||||
inSequence.add(index, splitSequenceResult.in);
|
||||
}
|
||||
|
||||
} else if (currentSequence.toString().startsWith(target)) {
|
||||
|
||||
int index = inSequence.size() - 1;
|
||||
String toRemove = currentSequence.substring(target.length());
|
||||
|
||||
SplitSequenceResult splitSequenceResult = splitSequence(inSequence.remove(index), toRemove);
|
||||
currentSequence.delete(currentSequence.length() - splitSequenceResult.out.length(), currentSequence.length());
|
||||
|
||||
inSequence.add(index, splitSequenceResult.in);
|
||||
postSequence.add(splitSequenceResult.out);
|
||||
}
|
||||
|
||||
if (currentSequence.toString().equals(target)) {
|
||||
postSequence.addAll(textPositionSequences.subList(textPositionSequences.indexOf(sequence) + 1, textPositionSequences.size()));
|
||||
return new WordSequenceResult(inSequence, preSequence, postSequence);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new WordSequenceResult();
|
||||
}
|
||||
|
||||
|
||||
private static SplitSequenceResult splitSequence(TextPositionSequence sequence, String toRemove) {
|
||||
|
||||
TextPositionSequence in = null;
|
||||
TextPositionSequence out;
|
||||
|
||||
String currentSequence = sequence.toString();
|
||||
int index = currentSequence.indexOf(toRemove);
|
||||
int endIndex = index + toRemove.length();
|
||||
|
||||
out = createSubSequence(sequence, index, endIndex);
|
||||
|
||||
if (index > 0) {
|
||||
in = createSubSequence(sequence, 0, index);
|
||||
} else if (endIndex < sequence.getTextPositions().size()) {
|
||||
in = createSubSequence(sequence, endIndex, sequence.getTextPositions().size());
|
||||
}
|
||||
|
||||
return new SplitSequenceResult(in, out);
|
||||
}
|
||||
|
||||
|
||||
public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) {
|
||||
|
||||
TextPositionSequence newSeq = new TextPositionSequence(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage());
|
||||
newSeq.setParagraphStart(sequence.isParagraphStart());
|
||||
return newSeq;
|
||||
}
|
||||
|
||||
|
||||
private static WordSequenceResultOld findWordSequenceOld(List<TextPositionSequence> textPositionSequences, String text) {
|
||||
|
||||
String target = sanitizeString(text);
|
||||
List<TextPositionSequence> inSequence = new ArrayList<>();
|
||||
List<TextPositionSequence> preSequence = new ArrayList<>();
|
||||
@ -186,10 +305,10 @@ public class BlockificationPostprocessingService {
|
||||
}
|
||||
|
||||
if (currentSequence.toString().equals(target)) {
|
||||
return new WordSequenceResult(inSequence, preSequence);
|
||||
return new WordSequenceResultOld(inSequence, preSequence);
|
||||
}
|
||||
}
|
||||
return new WordSequenceResult(new ArrayList<>(), new ArrayList<>());
|
||||
return new WordSequenceResultOld(new ArrayList<>(), new ArrayList<>());
|
||||
}
|
||||
|
||||
|
||||
@ -258,7 +377,6 @@ public class BlockificationPostprocessingService {
|
||||
}
|
||||
|
||||
|
||||
|
||||
// currently only three cases are handled here:
|
||||
// 1. equality
|
||||
// 2. outline title contains block text
|
||||
@ -301,7 +419,7 @@ public class BlockificationPostprocessingService {
|
||||
}
|
||||
|
||||
|
||||
private record WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence) {
|
||||
private record WordSequenceResultOld(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -83,7 +83,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
public void testViewerDocument() {
|
||||
|
||||
|
||||
String fileName = "files/documine/20_TiltPlus_SensibilizacaoCutanea.pdf";
|
||||
//String fileName = "files/documine/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
|
||||
//String fileName = "files/documine/Study Document 1 - Acute Eye IrritationCorrosion - Rabbits.pdf";
|
||||
//String fileName = "files/documine/ITEM 20_Sensibilização cutânea.pdf";
|
||||
//String fileName = "files/documine/VV-547523_LLNA.pdf";
|
||||
//String fileName = "files/new/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
//String fileName = "files/syngenta_190_deduplicated/1 Abamectin_prr.pdf";
|
||||
//String fileName = "files/new/abschlussarbeiten-template-institut-fur-informatik-padagogische-hochschule-karlsruhe.pdf";
|
||||
@ -92,7 +95,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
//String fileName = "files/new/18-Curacron_ToxicidadeOcularInVitro.pdf";
|
||||
//String fileName = "files/new/mistitled_outlines_example.pdf";
|
||||
//String fileName = "files/bdr/Plenarprotokoll 1 (keine Druchsache!) (1) 1.pdf";
|
||||
//String fileName = "files/new/UTT-Books-53.pdf";
|
||||
String fileName = "files/new/UTT-Books-53.pdf";
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user