Merge branch 'RED-9206' into 'main'

RED-9206 - Sections are no longer correctly separated from each other in the test file

See merge request fforesight/layout-parser!163
This commit is contained in:
Corina Olariu 2024-06-05 13:13:45 +02:00
commit b5cfa7b63d
11 changed files with 35 additions and 25 deletions

View File

@ -4,6 +4,7 @@ public enum LayoutParsingType {
REDACT_MANAGER, REDACT_MANAGER,
REDACT_MANAGER_OLD, REDACT_MANAGER_OLD,
REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG,
REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
DOCUMINE, DOCUMINE,
DOCUMINE_OLD, DOCUMINE_OLD,
CLARIFYND, CLARIFYND,

View File

@ -51,8 +51,8 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService; import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService; import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier; import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService; import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
@ -319,10 +319,16 @@ public class LayoutParsingPipeline {
case REDACT_MANAGER_OLD -> case REDACT_MANAGER_OLD ->
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations()); redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings); case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> docstrumBlockificationService.blockify(words,
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType); cleanRulings,
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> true,
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType); classificationDocument.getVisualizations(),
layoutParsingType);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words,
cleanRulings,
false,
classificationDocument.getVisualizations(),
layoutParsingType);
}; };
classificationPage.setCleanRulings(cleanRulings); classificationPage.setCleanRulings(cleanRulings);
@ -381,8 +387,8 @@ public class LayoutParsingPipeline {
} }
log.info("Classify TextBlocks for {}", identifier); log.info("Classify TextBlocks for {}", identifier);
switch (layoutParsingType) { switch (layoutParsingType) {
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG -> case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> redactManagerClassificationService.classifyDocument(
redactManagerClassificationService.classifyDocument(classificationDocument); classificationDocument);
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument); case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument); case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
} }

View File

@ -59,8 +59,10 @@ public class DocstrumBlockificationService {
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0); mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) { if (layoutParsingType == LayoutParsingType.DOCUMINE
combineBlocks(classificationPage); || layoutParsingType == LayoutParsingType.REDACT_MANAGER
|| layoutParsingType == LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH) {
combineBlocks(classificationPage, layoutParsingType);
} }
if (layoutParsingType == LayoutParsingType.CLARIFYND) { if (layoutParsingType == LayoutParsingType.CLARIFYND) {
@ -106,7 +108,7 @@ public class DocstrumBlockificationService {
} }
public void combineBlocks(ClassificationPage page) { public void combineBlocks(ClassificationPage page, LayoutParsingType layoutParsingType) {
TextPageBlock previous = new TextPageBlock(); TextPageBlock previous = new TextPageBlock();
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator(); ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
@ -138,7 +140,8 @@ public class DocstrumBlockificationService {
} }
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) { if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
previous = combineBlocksAndResetIterator(previous, current, itty, true); // previous = combineBlocksAndResetIterator(previous, current, itty, true);
previous = combineBlocksAndResetIterator(previous, current, itty, layoutParsingType != LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
continue; continue;
} }

View File

@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
file = new File(filePath); file = new File(filePath);
} }
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true); LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
prepareStorage(layoutParsingRequest, file); prepareStorage(layoutParsingRequest, file);
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest); LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);

View File

@ -56,7 +56,7 @@ public class OutlineDetectionTest extends AbstractTest {
var documentFile = new ClassPathResource(fileName).getFile(); var documentFile = new ClassPathResource(fileName).getFile();
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER); ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree(); OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
assertEquals(outlineObjectTree.getRootNodes().size(), 8); assertEquals(outlineObjectTree.getRootNodes().size(), 8);

View File

@ -64,8 +64,8 @@ public class SimplifiedTextServiceTest
@SneakyThrows @SneakyThrows
protected Document buildGraph(File file) { protected Document buildGraph(File file) {
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
file, file,
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse(), new TableServiceResponse(),

View File

@ -56,8 +56,8 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
@SneakyThrows @SneakyThrows
private void writeJsons(Path filename) { private void writeJsons(Path filename) {
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD, Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD, layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
filename.toFile(), filename.toFile(),
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse(), new TableServiceResponse(),

View File

@ -38,7 +38,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
var documentFile = new ClassPathResource(fileName).getFile(); var documentFile = new ClassPathResource(fileName).getFile();
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER); Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false); layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000); System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
} }

View File

@ -54,7 +54,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
@SneakyThrows @SneakyThrows
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) { public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
originDocument, originDocument,
new ImageServiceResponse(), new ImageServiceResponse(),
tableServiceResponse, tableServiceResponse,
@ -122,7 +122,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.get(0).getTextBlocks() .get(0).getTextBlocks()
.get(0).toString()).contains(textToSearch); .get(0).toString()).contains(textToSearch);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument); Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, classificationDocument);
TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock(); TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock();
assertThat(leafTextBlock.getSearchText().contains(textToSearch)).isTrue(); assertThat(leafTextBlock.getSearchText().contains(textToSearch)).isTrue();

View File

@ -103,15 +103,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
@SneakyThrows @SneakyThrows
private void writeJsons(Path filename) { private void writeJsons(Path filename) {
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
filename.toFile(), filename.toFile(),
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse(), new TableServiceResponse(),
new VisualLayoutParsingResponse(), new VisualLayoutParsingResponse(),
Map.of("file",filename.toFile().toString()))); Map.of("file",filename.toFile().toString())));
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
filename.toFile(), filename.toFile(),
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse(), new TableServiceResponse(),

View File

@ -50,7 +50,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
if (!filename.startsWith("files") && filename.startsWith("/")) { if (!filename.startsWith("files") && filename.startsWith("/")) {
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true); LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
prepareStorage(layoutParsingRequest, new File(filename)); prepareStorage(layoutParsingRequest, new File(filename));
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, return DocumentGraphFactory.buildDocumentGraph(layoutParsingType,
layoutParsingPipeline.parseLayout(layoutParsingType, layoutParsingPipeline.parseLayout(layoutParsingType,