RED-9206 - Sections are no longer correctly separated from each other in the test file
- Introduce a new layout parsing type, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, which includes all changes from REDACT_MANAGER except the duplicate paragraph handling. - Update JUnit tests.
This commit is contained in:
parent
c3edeb3c7d
commit
fd698a78fc
@ -4,6 +4,7 @@ public enum LayoutParsingType {
|
||||
REDACT_MANAGER,
|
||||
REDACT_MANAGER_OLD,
|
||||
REDACT_MANAGER_PARAGRAPH_DEBUG,
|
||||
REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
DOCUMINE,
|
||||
DOCUMINE_OLD,
|
||||
CLARIFYND,
|
||||
|
||||
@ -51,8 +51,8 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||
@ -319,10 +319,16 @@ public class LayoutParsingPipeline {
|
||||
case REDACT_MANAGER_OLD ->
|
||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
|
||||
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
||||
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
|
||||
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> docstrumBlockificationService.blockify(words,
|
||||
cleanRulings,
|
||||
true,
|
||||
classificationDocument.getVisualizations(),
|
||||
layoutParsingType);
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words,
|
||||
cleanRulings,
|
||||
false,
|
||||
classificationDocument.getVisualizations(),
|
||||
layoutParsingType);
|
||||
};
|
||||
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
@ -381,8 +387,8 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
log.info("Classify TextBlocks for {}", identifier);
|
||||
switch (layoutParsingType) {
|
||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> redactManagerClassificationService.classifyDocument(
|
||||
classificationDocument);
|
||||
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
|
||||
@ -58,8 +58,10 @@ public class DocstrumBlockificationService {
|
||||
|
||||
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
|
||||
|
||||
if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
||||
combineBlocks(classificationPage);
|
||||
if (layoutParsingType == LayoutParsingType.DOCUMINE
|
||||
|| layoutParsingType == LayoutParsingType.REDACT_MANAGER
|
||||
|| layoutParsingType == LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH) {
|
||||
combineBlocks(classificationPage, layoutParsingType);
|
||||
}
|
||||
|
||||
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
||||
@ -105,7 +107,7 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
public void combineBlocks(ClassificationPage page) {
|
||||
public void combineBlocks(ClassificationPage page, LayoutParsingType layoutParsingType) {
|
||||
|
||||
TextPageBlock previous = new TextPageBlock();
|
||||
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
||||
@ -137,7 +139,8 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
||||
// previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, layoutParsingType != LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
file = new File(filePath);
|
||||
}
|
||||
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
|
||||
prepareStorage(layoutParsingRequest, file);
|
||||
|
||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
|
||||
@ -56,7 +56,7 @@ public class OutlineDetectionTest extends AbstractTest {
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
|
||||
|
||||
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
|
||||
assertEquals(outlineObjectTree.getRootNodes().size(), 8);
|
||||
|
||||
@ -64,8 +64,8 @@ public class SimplifiedTextServiceTest
|
||||
@SneakyThrows
|
||||
protected Document buildGraph(File file) {
|
||||
|
||||
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
file,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
|
||||
@ -56,8 +56,8 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
private void writeJsons(Path filename) {
|
||||
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
|
||||
@ -38,7 +38,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
var documentFile = new ClassPathResource(fileName).getFile();
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
|
||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
|
||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||
}
|
||||
|
||||
@ -54,7 +54,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
@SneakyThrows
|
||||
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
|
||||
|
||||
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
tableServiceResponse,
|
||||
@ -122,7 +122,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
.get(0).getTextBlocks()
|
||||
.get(0).toString()).contains(textToSearch);
|
||||
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, classificationDocument);
|
||||
|
||||
TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock();
|
||||
assertThat(leafTextBlock.getSearchText().contains(textToSearch)).isTrue();
|
||||
|
||||
@ -103,15 +103,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
private void writeJsons(Path filename) {
|
||||
|
||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file",filename.toFile().toString())));
|
||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
|
||||
@ -50,7 +50,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
||||
|
||||
|
||||
if (!filename.startsWith("files") && filename.startsWith("/")) {
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true);
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
|
||||
prepareStorage(layoutParsingRequest, new File(filename));
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType,
|
||||
layoutParsingPipeline.parseLayout(layoutParsingType,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user