RED-7074: Design Subsection section tree structure algorithm
* bugfix
This commit is contained in:
parent
2d33615b94
commit
6a0661cf09
@ -16,7 +16,7 @@ public class OutlineObject {
|
||||
private Point2D point;
|
||||
private final int treeDepth;
|
||||
|
||||
private boolean found = false;
|
||||
private boolean found;
|
||||
|
||||
|
||||
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
|
||||
|
||||
@ -97,7 +97,7 @@ public class TableOfContents implements Iterable<TableOfContentItem> {
|
||||
private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();
|
||||
|
||||
|
||||
public TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
|
||||
TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
|
||||
|
||||
stack.push(mainSections.iterator());
|
||||
}
|
||||
|
||||
@ -206,9 +206,11 @@ public class BlockificationPostprocessingService {
|
||||
if (minDistance == distanceToDirectMatch) {
|
||||
directMatch.setClassification(headlineType);
|
||||
} else if (minDistance == distanceToSplitCandidate) {
|
||||
List<TextPageBlock> others = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle());
|
||||
splitCandidate.setClassification(headlineType);
|
||||
others.forEach(other -> other.setClassification(null));
|
||||
SplitBlockResult splitBlockResult = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle());
|
||||
if (splitBlockResult.modifiedBlockToSplit) {
|
||||
splitCandidate.setClassification(headlineType);
|
||||
}
|
||||
splitBlockResult.otherBlocks.forEach(other -> other.setClassification(null));
|
||||
} else {
|
||||
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
|
||||
merged.setClassification(headlineType);
|
||||
@ -217,7 +219,7 @@ public class BlockificationPostprocessingService {
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) {
|
||||
private SplitBlockResult splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) {
|
||||
|
||||
List<TextPageBlock> otherBlocks = new ArrayList<>();
|
||||
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
|
||||
@ -228,12 +230,16 @@ public class BlockificationPostprocessingService {
|
||||
}
|
||||
|
||||
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline);
|
||||
if (wordSequenceResult.inSequence.isEmpty()) {
|
||||
if (wordSequenceResult.inSequence.isEmpty() && !headline.equals(title)) {
|
||||
wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title);
|
||||
}
|
||||
|
||||
blockToSplit.setSequences(wordSequenceResult.inSequence);
|
||||
blockToSplit.resize();
|
||||
boolean modifiedBlockToSplit = false;
|
||||
if (!wordSequenceResult.inSequence.isEmpty()) {
|
||||
blockToSplit.setSequences(wordSequenceResult.inSequence);
|
||||
blockToSplit.resize();
|
||||
modifiedBlockToSplit = true;
|
||||
}
|
||||
|
||||
if (!wordSequenceResult.preSequence.isEmpty()) {
|
||||
TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
|
||||
@ -246,32 +252,8 @@ public class BlockificationPostprocessingService {
|
||||
classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block);
|
||||
otherBlocks.add(block);
|
||||
}
|
||||
return otherBlocks;
|
||||
}
|
||||
|
||||
|
||||
public static class WordSequenceResult {
|
||||
|
||||
public List<TextPositionSequence> inSequence;
|
||||
public List<TextPositionSequence> preSequence;
|
||||
public List<TextPositionSequence> postSequence;
|
||||
|
||||
|
||||
public WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence, List<TextPositionSequence> postSequence) {
|
||||
|
||||
this.inSequence = inSequence;
|
||||
this.preSequence = preSequence;
|
||||
this.postSequence = postSequence;
|
||||
}
|
||||
|
||||
|
||||
public WordSequenceResult() {
|
||||
|
||||
this.inSequence = new ArrayList<>();
|
||||
this.preSequence = new ArrayList<>();
|
||||
this.postSequence = new ArrayList<>();
|
||||
}
|
||||
|
||||
return new SplitBlockResult(modifiedBlockToSplit, otherBlocks);
|
||||
}
|
||||
|
||||
|
||||
@ -358,11 +340,6 @@ public class BlockificationPostprocessingService {
|
||||
}
|
||||
|
||||
|
||||
public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) {
|
||||
|
||||
TextPositionSequence newSeq = new TextPositionSequence(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage());
|
||||
@ -502,7 +479,7 @@ public class BlockificationPostprocessingService {
|
||||
private SectionIdentifier sectionIdentifier;
|
||||
|
||||
|
||||
public OutlineProcessionContext(OutlineObject outlineObject) {
|
||||
OutlineProcessionContext(OutlineObject outlineObject) {
|
||||
|
||||
this.outlineObject = outlineObject;
|
||||
this.directMatch = null;
|
||||
@ -513,4 +490,36 @@ public class BlockificationPostprocessingService {
|
||||
|
||||
}
|
||||
|
||||
public static class WordSequenceResult {
|
||||
|
||||
public List<TextPositionSequence> inSequence;
|
||||
public List<TextPositionSequence> preSequence;
|
||||
public List<TextPositionSequence> postSequence;
|
||||
|
||||
|
||||
public WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence, List<TextPositionSequence> postSequence) {
|
||||
|
||||
this.inSequence = inSequence;
|
||||
this.preSequence = preSequence;
|
||||
this.postSequence = postSequence;
|
||||
}
|
||||
|
||||
|
||||
public WordSequenceResult() {
|
||||
|
||||
this.inSequence = new ArrayList<>();
|
||||
this.preSequence = new ArrayList<>();
|
||||
this.postSequence = new ArrayList<>();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public record SplitBlockResult(boolean modifiedBlockToSplit, List<TextPageBlock> otherBlocks) {
|
||||
|
||||
}
|
||||
|
||||
public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -33,28 +33,29 @@ public class HeadlineClassificationService {
|
||||
}
|
||||
|
||||
|
||||
public void classifyHeadline(TextPageBlock textBlock, PageBlockType headlineType) {
|
||||
public void classifyHeadline(TextPageBlock textBlock, PageBlockType initialHeadlineType) {
|
||||
|
||||
TextPageBlock lastHeadline = getLastHeadline();
|
||||
TextPageBlock lastHeadlineFromOutline = getLastHeadlineFromOutline();
|
||||
PageBlockType originalClassifiedBlockType = getOriginalClassifiedBlockType();
|
||||
PageBlockType finalHeadlineType = initialHeadlineType;
|
||||
|
||||
if (lastHeadline != null) {
|
||||
|
||||
if (lastHeadline.equals(lastHeadlineFromOutline)) {
|
||||
|
||||
headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1);
|
||||
finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1);
|
||||
|
||||
} else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) {
|
||||
|
||||
PageBlockType lastHeadlineType = lastHeadline.getClassification();
|
||||
int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType);
|
||||
headlineType = PageBlockType.getHeadlineType(getHeadlineNumber(headlineType) + difference);
|
||||
finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(initialHeadlineType) - difference);
|
||||
}
|
||||
}
|
||||
|
||||
setOriginalClassifiedBlockType(headlineType);
|
||||
textBlock.setClassification(headlineType);
|
||||
setOriginalClassifiedBlockType(initialHeadlineType);
|
||||
textBlock.setClassification(finalHeadlineType);
|
||||
setLastHeadline(textBlock);
|
||||
}
|
||||
|
||||
|
||||
@ -150,7 +150,7 @@ public class TableNodeFactory {
|
||||
cell.getTextBlocks()
|
||||
.stream()
|
||||
.map(tb -> (AbstractPageBlock) tb)
|
||||
.toList(),
|
||||
.collect(Collectors.toList()),
|
||||
emptyList(),
|
||||
context,
|
||||
document);
|
||||
|
||||
@ -69,10 +69,10 @@ public class HeadlinesGoldStandardIntegrationTest {
|
||||
public void testHeadlineDetection() {
|
||||
|
||||
List<Metrics> metrics = new ArrayList<>();
|
||||
metrics.add(getMetrics("files/syngenta/CustomerFiles/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf",
|
||||
"files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json"));
|
||||
metrics.add(getMetrics("files/syngenta/CustomerFiles/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf",
|
||||
"files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json"));
|
||||
//metrics.add(getMetrics("files/syngenta/CustomerFiles/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf",
|
||||
// "files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json"));
|
||||
//metrics.add(getMetrics("files/syngenta/CustomerFiles/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf",
|
||||
// "files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json"));
|
||||
metrics.add(getMetrics("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf", "files/headlineTest/S-Metolachlor_RAR_01_Volume_1_2018-09-06_REDACTION_LOG.json"));
|
||||
|
||||
double precision = metrics.stream().mapToDouble(Metrics::getPrecision).average().orElse(1.0);
|
||||
@ -96,8 +96,8 @@ public class HeadlinesGoldStandardIntegrationTest {
|
||||
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
|
||||
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
|
||||
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
pdfFileResource.getFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
|
||||
@ -27,7 +27,8 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
String fileName = "files/syngenta/CustomerFiles/90 Trinexapac-ethyl - Peer Review Report Syngenta - March 2018.pdf";
|
||||
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
@ -92,6 +92,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
.toList();
|
||||
|
||||
for (String pdfFileName : pdfFileNames) {
|
||||
|
||||
writeJsons(Path.of(pdfFileName));
|
||||
}
|
||||
}
|
||||
@ -100,15 +101,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
private void writeJsons(Path filename) {
|
||||
|
||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file",filename.toFile().toString())));
|
||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user