RED-7074: Design Subsection section tree structure algorithm

* bugfix
This commit is contained in:
maverickstuder 2024-05-15 13:51:49 +02:00
parent 2d33615b94
commit 6a0661cf09
9 changed files with 69 additions and 57 deletions

View File

@ -16,7 +16,7 @@ public class OutlineObject {
private Point2D point; private Point2D point;
private final int treeDepth; private final int treeDepth;
private boolean found = false; private boolean found;
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) { public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {

View File

@ -97,7 +97,7 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>(); private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();
public TableOfContentItemIterator(List<TableOfContentItem> mainSections) { TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
stack.push(mainSections.iterator()); stack.push(mainSections.iterator());
} }

View File

@ -206,9 +206,11 @@ public class BlockificationPostprocessingService {
if (minDistance == distanceToDirectMatch) { if (minDistance == distanceToDirectMatch) {
directMatch.setClassification(headlineType); directMatch.setClassification(headlineType);
} else if (minDistance == distanceToSplitCandidate) { } else if (minDistance == distanceToSplitCandidate) {
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle()); SplitBlockResult splitBlockResult = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle());
splitCandidate.setClassification(headlineType); if (splitBlockResult.modifiedBlockToSplit) {
others.forEach(other -> other.setClassification(null)); splitCandidate.setClassification(headlineType);
}
splitBlockResult.otherBlocks.forEach(other -> other.setClassification(null));
} else { } else {
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination); var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
merged.setClassification(headlineType); merged.setClassification(headlineType);
@ -217,7 +219,7 @@ public class BlockificationPostprocessingService {
} }
private List<TextPageBlock> splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) { private SplitBlockResult splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) {
List<TextPageBlock> otherBlocks = new ArrayList<>(); List<TextPageBlock> otherBlocks = new ArrayList<>();
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit); int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
@ -228,12 +230,16 @@ public class BlockificationPostprocessingService {
} }
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline); WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline);
if (wordSequenceResult.inSequence.isEmpty()) { if (wordSequenceResult.inSequence.isEmpty() && !headline.equals(title)) {
wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title); wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title);
} }
blockToSplit.setSequences(wordSequenceResult.inSequence); boolean modifiedBlockToSplit = false;
blockToSplit.resize(); if (!wordSequenceResult.inSequence.isEmpty()) {
blockToSplit.setSequences(wordSequenceResult.inSequence);
blockToSplit.resize();
modifiedBlockToSplit = true;
}
if (!wordSequenceResult.preSequence.isEmpty()) { if (!wordSequenceResult.preSequence.isEmpty()) {
TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0); TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
@ -246,32 +252,8 @@ public class BlockificationPostprocessingService {
classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block); classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block);
otherBlocks.add(block); otherBlocks.add(block);
} }
return otherBlocks;
}
public static class WordSequenceResult {
public List<TextPositionSequence> inSequence;
public List<TextPositionSequence> preSequence;
public List<TextPositionSequence> postSequence;
public WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence, List<TextPositionSequence> postSequence) {
this.inSequence = inSequence;
this.preSequence = preSequence;
this.postSequence = postSequence;
}
public WordSequenceResult() {
this.inSequence = new ArrayList<>();
this.preSequence = new ArrayList<>();
this.postSequence = new ArrayList<>();
}
return new SplitBlockResult(modifiedBlockToSplit, otherBlocks);
} }
@ -358,11 +340,6 @@ public class BlockificationPostprocessingService {
} }
public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) {
}
private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) { private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) {
TextPositionSequence newSeq = new TextPositionSequence(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage()); TextPositionSequence newSeq = new TextPositionSequence(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage());
@ -502,7 +479,7 @@ public class BlockificationPostprocessingService {
private SectionIdentifier sectionIdentifier; private SectionIdentifier sectionIdentifier;
public OutlineProcessionContext(OutlineObject outlineObject) { OutlineProcessionContext(OutlineObject outlineObject) {
this.outlineObject = outlineObject; this.outlineObject = outlineObject;
this.directMatch = null; this.directMatch = null;
@ -513,4 +490,36 @@ public class BlockificationPostprocessingService {
} }
public static class WordSequenceResult {
public List<TextPositionSequence> inSequence;
public List<TextPositionSequence> preSequence;
public List<TextPositionSequence> postSequence;
public WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence, List<TextPositionSequence> postSequence) {
this.inSequence = inSequence;
this.preSequence = preSequence;
this.postSequence = postSequence;
}
public WordSequenceResult() {
this.inSequence = new ArrayList<>();
this.preSequence = new ArrayList<>();
this.postSequence = new ArrayList<>();
}
}
public record SplitBlockResult(boolean modifiedBlockToSplit, List<TextPageBlock> otherBlocks) {
}
public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) {
}
} }

View File

@ -33,28 +33,29 @@ public class HeadlineClassificationService {
} }
public void classifyHeadline(TextPageBlock textBlock, PageBlockType headlineType) { public void classifyHeadline(TextPageBlock textBlock, PageBlockType initialHeadlineType) {
TextPageBlock lastHeadline = getLastHeadline(); TextPageBlock lastHeadline = getLastHeadline();
TextPageBlock lastHeadlineFromOutline = getLastHeadlineFromOutline(); TextPageBlock lastHeadlineFromOutline = getLastHeadlineFromOutline();
PageBlockType originalClassifiedBlockType = getOriginalClassifiedBlockType(); PageBlockType originalClassifiedBlockType = getOriginalClassifiedBlockType();
PageBlockType finalHeadlineType = initialHeadlineType;
if (lastHeadline != null) { if (lastHeadline != null) {
if (lastHeadline.equals(lastHeadlineFromOutline)) { if (lastHeadline.equals(lastHeadlineFromOutline)) {
headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1); finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1);
} else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) { } else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) {
PageBlockType lastHeadlineType = lastHeadline.getClassification(); PageBlockType lastHeadlineType = lastHeadline.getClassification();
int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType); int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType);
headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(headlineType) + difference); finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(initialHeadlineType) - difference);
} }
} }
setOriginalClassifiedBlockType(headlineType); setOriginalClassifiedBlockType(initialHeadlineType);
textBlock.setClassification(headlineType); textBlock.setClassification(finalHeadlineType);
setLastHeadline(textBlock); setLastHeadline(textBlock);
} }

View File

@ -150,7 +150,7 @@ public class TableNodeFactory {
cell.getTextBlocks() cell.getTextBlocks()
.stream() .stream()
.map(tb -> (AbstractPageBlock) tb) .map(tb -> (AbstractPageBlock) tb)
.toList(), .collect(Collectors.toList()),
emptyList(), emptyList(),
context, context,
document); document);

View File

@ -69,10 +69,10 @@ public class HeadlinesGoldStandardIntegrationTest {
public void testHeadlineDetection() { public void testHeadlineDetection() {
List<Metrics> metrics = new ArrayList<>(); List<Metrics> metrics = new ArrayList<>();
metrics.add(getMetrics("files/syngenta/CustomerFiles/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf", //metrics.add(getMetrics("files/syngenta/CustomerFiles/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf",
"files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json")); // "files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json"));
metrics.add(getMetrics("files/syngenta/CustomerFiles/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf", //metrics.add(getMetrics("files/syngenta/CustomerFiles/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf",
"files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json")); // "files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json"));
metrics.add(getMetrics("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf", "files/headlineTest/S-Metolachlor_RAR_01_Volume_1_2018-09-06_REDACTION_LOG.json")); metrics.add(getMetrics("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf", "files/headlineTest/S-Metolachlor_RAR_01_Volume_1_2018-09-06_REDACTION_LOG.json"));
double precision = metrics.stream().mapToDouble(Metrics::getPrecision).average().orElse(1.0); double precision = metrics.stream().mapToDouble(Metrics::getPrecision).average().orElse(1.0);
@ -96,8 +96,8 @@ public class HeadlinesGoldStandardIntegrationTest {
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED)); goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue()))); goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD, Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
pdfFileResource.getFile(), pdfFileResource.getFile(),
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse(), new TableServiceResponse(),

View File

@ -27,7 +27,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows @SneakyThrows
public void testViewerDocument() { public void testViewerDocument() {
String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf"; String fileName = "files/syngenta/CustomerFiles/90 Trinexapac-ethyl - Peer Review Report Syngenta - March 2018.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf"; String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile(); var documentFile = new ClassPathResource(fileName).getFile();

View File

@ -92,6 +92,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
.toList(); .toList();
for (String pdfFileName : pdfFileNames) { for (String pdfFileName : pdfFileNames) {
writeJsons(Path.of(pdfFileName)); writeJsons(Path.of(pdfFileName));
} }
} }
@ -100,15 +101,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
@SneakyThrows @SneakyThrows
private void writeJsons(Path filename) { private void writeJsons(Path filename) {
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD, Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
filename.toFile(), filename.toFile(),
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse(), new TableServiceResponse(),
new VisualLayoutParsingResponse(), new VisualLayoutParsingResponse(),
Map.of("file",filename.toFile().toString()))); Map.of("file",filename.toFile().toString())));
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD, Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
filename.toFile(), filename.toFile(),
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse(), new TableServiceResponse(),