RED-7074: Design Subsection section tree structure algorithm
* bugfix
This commit is contained in:
parent
2d33615b94
commit
6a0661cf09
@ -16,7 +16,7 @@ public class OutlineObject {
|
|||||||
private Point2D point;
|
private Point2D point;
|
||||||
private final int treeDepth;
|
private final int treeDepth;
|
||||||
|
|
||||||
private boolean found = false;
|
private boolean found;
|
||||||
|
|
||||||
|
|
||||||
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
|
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
|
||||||
|
|||||||
@ -97,7 +97,7 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
|
|||||||
private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();
|
private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();
|
||||||
|
|
||||||
|
|
||||||
public TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
|
TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
|
||||||
|
|
||||||
stack.push(mainSections.iterator());
|
stack.push(mainSections.iterator());
|
||||||
}
|
}
|
||||||
|
|||||||
@ -206,9 +206,11 @@ public class BlockificationPostprocessingService {
|
|||||||
if (minDistance == distanceToDirectMatch) {
|
if (minDistance == distanceToDirectMatch) {
|
||||||
directMatch.setClassification(headlineType);
|
directMatch.setClassification(headlineType);
|
||||||
} else if (minDistance == distanceToSplitCandidate) {
|
} else if (minDistance == distanceToSplitCandidate) {
|
||||||
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle());
|
SplitBlockResult splitBlockResult = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle());
|
||||||
splitCandidate.setClassification(headlineType);
|
if (splitBlockResult.modifiedBlockToSplit) {
|
||||||
others.forEach(other -> other.setClassification(null));
|
splitCandidate.setClassification(headlineType);
|
||||||
|
}
|
||||||
|
splitBlockResult.otherBlocks.forEach(other -> other.setClassification(null));
|
||||||
} else {
|
} else {
|
||||||
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
|
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
|
||||||
merged.setClassification(headlineType);
|
merged.setClassification(headlineType);
|
||||||
@ -217,7 +219,7 @@ public class BlockificationPostprocessingService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<TextPageBlock> splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) {
|
private SplitBlockResult splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) {
|
||||||
|
|
||||||
List<TextPageBlock> otherBlocks = new ArrayList<>();
|
List<TextPageBlock> otherBlocks = new ArrayList<>();
|
||||||
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
|
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
|
||||||
@ -228,12 +230,16 @@ public class BlockificationPostprocessingService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline);
|
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline);
|
||||||
if (wordSequenceResult.inSequence.isEmpty()) {
|
if (wordSequenceResult.inSequence.isEmpty() && !headline.equals(title)) {
|
||||||
wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title);
|
wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title);
|
||||||
}
|
}
|
||||||
|
|
||||||
blockToSplit.setSequences(wordSequenceResult.inSequence);
|
boolean modifiedBlockToSplit = false;
|
||||||
blockToSplit.resize();
|
if (!wordSequenceResult.inSequence.isEmpty()) {
|
||||||
|
blockToSplit.setSequences(wordSequenceResult.inSequence);
|
||||||
|
blockToSplit.resize();
|
||||||
|
modifiedBlockToSplit = true;
|
||||||
|
}
|
||||||
|
|
||||||
if (!wordSequenceResult.preSequence.isEmpty()) {
|
if (!wordSequenceResult.preSequence.isEmpty()) {
|
||||||
TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
|
TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
|
||||||
@ -246,32 +252,8 @@ public class BlockificationPostprocessingService {
|
|||||||
classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block);
|
classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block);
|
||||||
otherBlocks.add(block);
|
otherBlocks.add(block);
|
||||||
}
|
}
|
||||||
return otherBlocks;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public static class WordSequenceResult {
|
|
||||||
|
|
||||||
public List<TextPositionSequence> inSequence;
|
|
||||||
public List<TextPositionSequence> preSequence;
|
|
||||||
public List<TextPositionSequence> postSequence;
|
|
||||||
|
|
||||||
|
|
||||||
public WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence, List<TextPositionSequence> postSequence) {
|
|
||||||
|
|
||||||
this.inSequence = inSequence;
|
|
||||||
this.preSequence = preSequence;
|
|
||||||
this.postSequence = postSequence;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public WordSequenceResult() {
|
|
||||||
|
|
||||||
this.inSequence = new ArrayList<>();
|
|
||||||
this.preSequence = new ArrayList<>();
|
|
||||||
this.postSequence = new ArrayList<>();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
return new SplitBlockResult(modifiedBlockToSplit, otherBlocks);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -358,11 +340,6 @@ public class BlockificationPostprocessingService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) {
|
private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) {
|
||||||
|
|
||||||
TextPositionSequence newSeq = new TextPositionSequence(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage());
|
TextPositionSequence newSeq = new TextPositionSequence(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage());
|
||||||
@ -502,7 +479,7 @@ public class BlockificationPostprocessingService {
|
|||||||
private SectionIdentifier sectionIdentifier;
|
private SectionIdentifier sectionIdentifier;
|
||||||
|
|
||||||
|
|
||||||
public OutlineProcessionContext(OutlineObject outlineObject) {
|
OutlineProcessionContext(OutlineObject outlineObject) {
|
||||||
|
|
||||||
this.outlineObject = outlineObject;
|
this.outlineObject = outlineObject;
|
||||||
this.directMatch = null;
|
this.directMatch = null;
|
||||||
@ -513,4 +490,36 @@ public class BlockificationPostprocessingService {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static class WordSequenceResult {
|
||||||
|
|
||||||
|
public List<TextPositionSequence> inSequence;
|
||||||
|
public List<TextPositionSequence> preSequence;
|
||||||
|
public List<TextPositionSequence> postSequence;
|
||||||
|
|
||||||
|
|
||||||
|
public WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence, List<TextPositionSequence> postSequence) {
|
||||||
|
|
||||||
|
this.inSequence = inSequence;
|
||||||
|
this.preSequence = preSequence;
|
||||||
|
this.postSequence = postSequence;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public WordSequenceResult() {
|
||||||
|
|
||||||
|
this.inSequence = new ArrayList<>();
|
||||||
|
this.preSequence = new ArrayList<>();
|
||||||
|
this.postSequence = new ArrayList<>();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public record SplitBlockResult(boolean modifiedBlockToSplit, List<TextPageBlock> otherBlocks) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -33,28 +33,29 @@ public class HeadlineClassificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void classifyHeadline(TextPageBlock textBlock, PageBlockType headlineType) {
|
public void classifyHeadline(TextPageBlock textBlock, PageBlockType initialHeadlineType) {
|
||||||
|
|
||||||
TextPageBlock lastHeadline = getLastHeadline();
|
TextPageBlock lastHeadline = getLastHeadline();
|
||||||
TextPageBlock lastHeadlineFromOutline = getLastHeadlineFromOutline();
|
TextPageBlock lastHeadlineFromOutline = getLastHeadlineFromOutline();
|
||||||
PageBlockType originalClassifiedBlockType = getOriginalClassifiedBlockType();
|
PageBlockType originalClassifiedBlockType = getOriginalClassifiedBlockType();
|
||||||
|
PageBlockType finalHeadlineType = initialHeadlineType;
|
||||||
|
|
||||||
if (lastHeadline != null) {
|
if (lastHeadline != null) {
|
||||||
|
|
||||||
if (lastHeadline.equals(lastHeadlineFromOutline)) {
|
if (lastHeadline.equals(lastHeadlineFromOutline)) {
|
||||||
|
|
||||||
headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1);
|
finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1);
|
||||||
|
|
||||||
} else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) {
|
} else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) {
|
||||||
|
|
||||||
PageBlockType lastHeadlineType = lastHeadline.getClassification();
|
PageBlockType lastHeadlineType = lastHeadline.getClassification();
|
||||||
int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType);
|
int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType);
|
||||||
headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(headlineType) + difference);
|
finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(initialHeadlineType) - difference);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
setOriginalClassifiedBlockType(headlineType);
|
setOriginalClassifiedBlockType(initialHeadlineType);
|
||||||
textBlock.setClassification(headlineType);
|
textBlock.setClassification(finalHeadlineType);
|
||||||
setLastHeadline(textBlock);
|
setLastHeadline(textBlock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -150,7 +150,7 @@ public class TableNodeFactory {
|
|||||||
cell.getTextBlocks()
|
cell.getTextBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
.map(tb -> (AbstractPageBlock) tb)
|
.map(tb -> (AbstractPageBlock) tb)
|
||||||
.toList(),
|
.collect(Collectors.toList()),
|
||||||
emptyList(),
|
emptyList(),
|
||||||
context,
|
context,
|
||||||
document);
|
document);
|
||||||
|
|||||||
@ -69,10 +69,10 @@ public class HeadlinesGoldStandardIntegrationTest {
|
|||||||
public void testHeadlineDetection() {
|
public void testHeadlineDetection() {
|
||||||
|
|
||||||
List<Metrics> metrics = new ArrayList<>();
|
List<Metrics> metrics = new ArrayList<>();
|
||||||
metrics.add(getMetrics("files/syngenta/CustomerFiles/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf",
|
//metrics.add(getMetrics("files/syngenta/CustomerFiles/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf",
|
||||||
"files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json"));
|
// "files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json"));
|
||||||
metrics.add(getMetrics("files/syngenta/CustomerFiles/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf",
|
//metrics.add(getMetrics("files/syngenta/CustomerFiles/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf",
|
||||||
"files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json"));
|
// "files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json"));
|
||||||
metrics.add(getMetrics("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf", "files/headlineTest/S-Metolachlor_RAR_01_Volume_1_2018-09-06_REDACTION_LOG.json"));
|
metrics.add(getMetrics("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf", "files/headlineTest/S-Metolachlor_RAR_01_Volume_1_2018-09-06_REDACTION_LOG.json"));
|
||||||
|
|
||||||
double precision = metrics.stream().mapToDouble(Metrics::getPrecision).average().orElse(1.0);
|
double precision = metrics.stream().mapToDouble(Metrics::getPrecision).average().orElse(1.0);
|
||||||
@ -96,8 +96,8 @@ public class HeadlinesGoldStandardIntegrationTest {
|
|||||||
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
|
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
|
||||||
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
|
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
|
||||||
|
|
||||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
||||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
pdfFileResource.getFile(),
|
pdfFileResource.getFile(),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse(),
|
new TableServiceResponse(),
|
||||||
|
|||||||
@ -27,7 +27,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
|
|
||||||
String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
String fileName = "files/syngenta/CustomerFiles/90 Trinexapac-ethyl - Peer Review Report Syngenta - March 2018.pdf";
|
||||||
|
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
|
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
|
|||||||
@ -92,6 +92,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
for (String pdfFileName : pdfFileNames) {
|
for (String pdfFileName : pdfFileNames) {
|
||||||
|
|
||||||
writeJsons(Path.of(pdfFileName));
|
writeJsons(Path.of(pdfFileName));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -100,15 +101,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void writeJsons(Path filename) {
|
private void writeJsons(Path filename) {
|
||||||
|
|
||||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
||||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
filename.toFile(),
|
filename.toFile(),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse(),
|
new TableServiceResponse(),
|
||||||
new VisualLayoutParsingResponse(),
|
new VisualLayoutParsingResponse(),
|
||||||
Map.of("file",filename.toFile().toString())));
|
Map.of("file",filename.toFile().toString())));
|
||||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
||||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
filename.toFile(),
|
filename.toFile(),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse(),
|
new TableServiceResponse(),
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user