Merge branch 'RED-9206' into 'main'

RED-9206 - Sections are no longer correctly separated from each other in the test file

See merge request fforesight/layout-parser!163
This commit is contained in:
Corina Olariu 2024-06-05 13:13:45 +02:00
commit b5cfa7b63d
11 changed files with 35 additions and 25 deletions

View File

@ -4,6 +4,7 @@ public enum LayoutParsingType {
REDACT_MANAGER,
REDACT_MANAGER_OLD,
REDACT_MANAGER_PARAGRAPH_DEBUG,
REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
DOCUMINE,
DOCUMINE_OLD,
CLARIFYND,

View File

@ -51,8 +51,8 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
@ -319,10 +319,16 @@ public class LayoutParsingPipeline {
case REDACT_MANAGER_OLD ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> docstrumBlockificationService.blockify(words,
cleanRulings,
true,
classificationDocument.getVisualizations(),
layoutParsingType);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words,
cleanRulings,
false,
classificationDocument.getVisualizations(),
layoutParsingType);
};
classificationPage.setCleanRulings(cleanRulings);
@ -381,8 +387,8 @@ public class LayoutParsingPipeline {
}
log.info("Classify TextBlocks for {}", identifier);
switch (layoutParsingType) {
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG ->
redactManagerClassificationService.classifyDocument(classificationDocument);
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> redactManagerClassificationService.classifyDocument(
classificationDocument);
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
}

View File

@ -59,8 +59,10 @@ public class DocstrumBlockificationService {
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
combineBlocks(classificationPage);
if (layoutParsingType == LayoutParsingType.DOCUMINE
|| layoutParsingType == LayoutParsingType.REDACT_MANAGER
|| layoutParsingType == LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH) {
combineBlocks(classificationPage, layoutParsingType);
}
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
@ -106,7 +108,7 @@ public class DocstrumBlockificationService {
}
public void combineBlocks(ClassificationPage page) {
public void combineBlocks(ClassificationPage page, LayoutParsingType layoutParsingType) {
TextPageBlock previous = new TextPageBlock();
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
@ -138,7 +140,8 @@ public class DocstrumBlockificationService {
}
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
previous = combineBlocksAndResetIterator(previous, current, itty, true);
// The last argument used to be hard-coded to true; it is now false only for REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH.
previous = combineBlocksAndResetIterator(previous, current, itty, layoutParsingType != LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
continue;
}

View File

@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
file = new File(filePath);
}
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
prepareStorage(layoutParsingRequest, file);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);

View File

@ -56,7 +56,7 @@ public class OutlineDetectionTest extends AbstractTest {
var documentFile = new ClassPathResource(fileName).getFile();
long start = System.currentTimeMillis();
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER);
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
assertEquals(outlineObjectTree.getRootNodes().size(), 8);

View File

@ -64,8 +64,8 @@ public class SimplifiedTextServiceTest
@SneakyThrows
protected Document buildGraph(File file) {
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
file,
new ImageServiceResponse(),
new TableServiceResponse(),

View File

@ -56,8 +56,8 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
@SneakyThrows
private void writeJsons(Path filename) {
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),

View File

@ -38,7 +38,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
var documentFile = new ClassPathResource(fileName).getFile();
long start = System.currentTimeMillis();
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
}

View File

@ -54,7 +54,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@SneakyThrows
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
originDocument,
new ImageServiceResponse(),
tableServiceResponse,
@ -122,7 +122,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.get(0).getTextBlocks()
.get(0).toString()).contains(textToSearch);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, classificationDocument);
TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock();
assertThat(leafTextBlock.getSearchText().contains(textToSearch)).isTrue();

View File

@ -103,15 +103,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
@SneakyThrows
private void writeJsons(Path filename) {
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
Map.of("file",filename.toFile().toString())));
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),

View File

@ -50,7 +50,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
if (!filename.startsWith("files") && filename.startsWith("/")) {
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true);
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
prepareStorage(layoutParsingRequest, new File(filename));
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType,
layoutParsingPipeline.parseLayout(layoutParsingType,