RED-7074: Design Subsection section tree structure algorithm

* bugfix
This commit is contained in:
maverickstuder 2024-05-15 13:51:49 +02:00
parent 2d33615b94
commit 6a0661cf09
9 changed files with 69 additions and 57 deletions

View File

@ -16,7 +16,7 @@ public class OutlineObject {
private Point2D point;
private final int treeDepth;
private boolean found = false;
private boolean found;
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {

View File

@ -97,7 +97,7 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();
public TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
stack.push(mainSections.iterator());
}

View File

@ -206,9 +206,11 @@ public class BlockificationPostprocessingService {
if (minDistance == distanceToDirectMatch) {
directMatch.setClassification(headlineType);
} else if (minDistance == distanceToSplitCandidate) {
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle());
splitCandidate.setClassification(headlineType);
others.forEach(other -> other.setClassification(null));
SplitBlockResult splitBlockResult = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle());
if (splitBlockResult.modifiedBlockToSplit) {
splitCandidate.setClassification(headlineType);
}
splitBlockResult.otherBlocks.forEach(other -> other.setClassification(null));
} else {
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
merged.setClassification(headlineType);
@ -217,7 +219,7 @@ public class BlockificationPostprocessingService {
}
private List<TextPageBlock> splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) {
private SplitBlockResult splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) {
List<TextPageBlock> otherBlocks = new ArrayList<>();
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
@ -228,12 +230,16 @@ public class BlockificationPostprocessingService {
}
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline);
if (wordSequenceResult.inSequence.isEmpty()) {
if (wordSequenceResult.inSequence.isEmpty() && !headline.equals(title)) {
wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title);
}
blockToSplit.setSequences(wordSequenceResult.inSequence);
blockToSplit.resize();
boolean modifiedBlockToSplit = false;
if (!wordSequenceResult.inSequence.isEmpty()) {
blockToSplit.setSequences(wordSequenceResult.inSequence);
blockToSplit.resize();
modifiedBlockToSplit = true;
}
if (!wordSequenceResult.preSequence.isEmpty()) {
TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
@ -246,32 +252,8 @@ public class BlockificationPostprocessingService {
classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block);
otherBlocks.add(block);
}
return otherBlocks;
}
public static class WordSequenceResult {
public List<TextPositionSequence> inSequence;
public List<TextPositionSequence> preSequence;
public List<TextPositionSequence> postSequence;
public WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence, List<TextPositionSequence> postSequence) {
this.inSequence = inSequence;
this.preSequence = preSequence;
this.postSequence = postSequence;
}
public WordSequenceResult() {
this.inSequence = new ArrayList<>();
this.preSequence = new ArrayList<>();
this.postSequence = new ArrayList<>();
}
return new SplitBlockResult(modifiedBlockToSplit, otherBlocks);
}
@ -358,11 +340,6 @@ public class BlockificationPostprocessingService {
}
public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) {
}
private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) {
TextPositionSequence newSeq = new TextPositionSequence(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage());
@ -502,7 +479,7 @@ public class BlockificationPostprocessingService {
private SectionIdentifier sectionIdentifier;
public OutlineProcessionContext(OutlineObject outlineObject) {
OutlineProcessionContext(OutlineObject outlineObject) {
this.outlineObject = outlineObject;
this.directMatch = null;
@ -513,4 +490,36 @@ public class BlockificationPostprocessingService {
}
/**
 * Holds the three lists of text position sequences returned by a word-sequence search
 * (see findWordSequence): the sequences in the match, plus two lists of remaining sequences.
 * All fields are public and mutable and are read directly by callers.
 */
public static class WordSequenceResult {
// Sequences that splitBlock assigns to the block being split when this list is non-empty;
// presumably the sequences that matched the searched text.
public List<TextPositionSequence> inSequence;
// Sequences that splitBlock turns into a separate text block when this list is non-empty;
// presumably the sequences preceding the match - confirm against findWordSequence.
public List<TextPositionSequence> preSequence;
// Presumably the sequences following the match - confirm against findWordSequence.
public List<TextPositionSequence> postSequence;
/**
 * Creates a result from the given lists. The lists are stored by reference, not copied.
 *
 * @param inSequence value for the inSequence field
 * @param preSequence value for the preSequence field
 * @param postSequence value for the postSequence field
 */
public WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence, List<TextPositionSequence> postSequence) {
this.inSequence = inSequence;
this.preSequence = preSequence;
this.postSequence = postSequence;
}
/**
 * Creates an empty result: each of the three fields is a new, empty ArrayList.
 */
public WordSequenceResult() {
this.inSequence = new ArrayList<>();
this.preSequence = new ArrayList<>();
this.postSequence = new ArrayList<>();
}
}
/**
 * Return value of splitBlock.
 *
 * @param modifiedBlockToSplit true when a non-empty in-sequence was found, so the block to split
 *        had its sequences replaced and was resized; the caller only applies the headline
 *        classification to the split candidate in that case
 * @param otherBlocks the additional text blocks created by the split and added to the page;
 *        the caller resets their classification to null
 */
public record SplitBlockResult(boolean modifiedBlockToSplit, List<TextPageBlock> otherBlocks) {
}
/**
 * Pair of text position sequences.
 * NOTE(review): presumably the part of a split sequence that belongs to the match ({@code in})
 * and the remaining part ({@code out}) - the constructing code is not visible here, confirm.
 */
public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) {
}
}

View File

@ -33,28 +33,29 @@ public class HeadlineClassificationService {
}
public void classifyHeadline(TextPageBlock textBlock, PageBlockType headlineType) {
public void classifyHeadline(TextPageBlock textBlock, PageBlockType initialHeadlineType) {
TextPageBlock lastHeadline = getLastHeadline();
TextPageBlock lastHeadlineFromOutline = getLastHeadlineFromOutline();
PageBlockType originalClassifiedBlockType = getOriginalClassifiedBlockType();
PageBlockType finalHeadlineType = initialHeadlineType;
if (lastHeadline != null) {
if (lastHeadline.equals(lastHeadlineFromOutline)) {
headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1);
finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1);
} else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) {
PageBlockType lastHeadlineType = lastHeadline.getClassification();
int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType);
headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(headlineType) + difference);
finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(initialHeadlineType) - difference);
}
}
setOriginalClassifiedBlockType(headlineType);
textBlock.setClassification(headlineType);
setOriginalClassifiedBlockType(initialHeadlineType);
textBlock.setClassification(finalHeadlineType);
setLastHeadline(textBlock);
}

View File

@ -150,7 +150,7 @@ public class TableNodeFactory {
cell.getTextBlocks()
.stream()
.map(tb -> (AbstractPageBlock) tb)
.toList(),
.collect(Collectors.toList()),
emptyList(),
context,
document);

View File

@ -69,10 +69,10 @@ public class HeadlinesGoldStandardIntegrationTest {
public void testHeadlineDetection() {
List<Metrics> metrics = new ArrayList<>();
metrics.add(getMetrics("files/syngenta/CustomerFiles/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf",
"files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json"));
metrics.add(getMetrics("files/syngenta/CustomerFiles/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf",
"files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json"));
//metrics.add(getMetrics("files/syngenta/CustomerFiles/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf",
// "files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json"));
//metrics.add(getMetrics("files/syngenta/CustomerFiles/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf",
// "files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json"));
metrics.add(getMetrics("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf", "files/headlineTest/S-Metolachlor_RAR_01_Volume_1_2018-09-06_REDACTION_LOG.json"));
double precision = metrics.stream().mapToDouble(Metrics::getPrecision).average().orElse(1.0);
@ -96,8 +96,8 @@ public class HeadlinesGoldStandardIntegrationTest {
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
pdfFileResource.getFile(),
new ImageServiceResponse(),
new TableServiceResponse(),

View File

@ -27,7 +27,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
String fileName = "files/syngenta/CustomerFiles/90 Trinexapac-ethyl - Peer Review Report Syngenta - March 2018.pdf";
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
var documentFile = new ClassPathResource(fileName).getFile();

View File

@ -92,6 +92,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
.toList();
for (String pdfFileName : pdfFileNames) {
writeJsons(Path.of(pdfFileName));
}
}
@ -100,15 +101,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
@SneakyThrows
private void writeJsons(Path filename) {
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
Map.of("file",filename.toFile().toString())));
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),