Merge branch 'RED-9206' into 'main'
RED-9206 - Sections are no longer correctly separated from each other in the test file. See merge request fforesight/layout-parser!163
This commit is contained in:
commit
b5cfa7b63d
@ -4,6 +4,7 @@ public enum LayoutParsingType {
|
|||||||
REDACT_MANAGER,
|
REDACT_MANAGER,
|
||||||
REDACT_MANAGER_OLD,
|
REDACT_MANAGER_OLD,
|
||||||
REDACT_MANAGER_PARAGRAPH_DEBUG,
|
REDACT_MANAGER_PARAGRAPH_DEBUG,
|
||||||
|
REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||||
DOCUMINE,
|
DOCUMINE,
|
||||||
DOCUMINE_OLD,
|
DOCUMINE_OLD,
|
||||||
CLARIFYND,
|
CLARIFYND,
|
||||||
|
|||||||
@ -51,8 +51,8 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||||
@ -319,10 +319,16 @@ public class LayoutParsingPipeline {
|
|||||||
case REDACT_MANAGER_OLD ->
|
case REDACT_MANAGER_OLD ->
|
||||||
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
|
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
|
||||||
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
||||||
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
|
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> docstrumBlockificationService.blockify(words,
|
||||||
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
|
cleanRulings,
|
||||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
true,
|
||||||
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
|
classificationDocument.getVisualizations(),
|
||||||
|
layoutParsingType);
|
||||||
|
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words,
|
||||||
|
cleanRulings,
|
||||||
|
false,
|
||||||
|
classificationDocument.getVisualizations(),
|
||||||
|
layoutParsingType);
|
||||||
};
|
};
|
||||||
|
|
||||||
classificationPage.setCleanRulings(cleanRulings);
|
classificationPage.setCleanRulings(cleanRulings);
|
||||||
@ -381,8 +387,8 @@ public class LayoutParsingPipeline {
|
|||||||
}
|
}
|
||||||
log.info("Classify TextBlocks for {}", identifier);
|
log.info("Classify TextBlocks for {}", identifier);
|
||||||
switch (layoutParsingType) {
|
switch (layoutParsingType) {
|
||||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG ->
|
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> redactManagerClassificationService.classifyDocument(
|
||||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
classificationDocument);
|
||||||
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||||
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -59,8 +59,10 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
|
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
|
||||||
|
|
||||||
if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
if (layoutParsingType == LayoutParsingType.DOCUMINE
|
||||||
combineBlocks(classificationPage);
|
|| layoutParsingType == LayoutParsingType.REDACT_MANAGER
|
||||||
|
|| layoutParsingType == LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH) {
|
||||||
|
combineBlocks(classificationPage, layoutParsingType);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
||||||
@ -106,7 +108,7 @@ public class DocstrumBlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void combineBlocks(ClassificationPage page) {
|
public void combineBlocks(ClassificationPage page, LayoutParsingType layoutParsingType) {
|
||||||
|
|
||||||
TextPageBlock previous = new TextPageBlock();
|
TextPageBlock previous = new TextPageBlock();
|
||||||
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
||||||
@ -138,7 +140,8 @@ public class DocstrumBlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
||||||
previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
// previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
||||||
|
previous = combineBlocksAndResetIterator(previous, current, itty, layoutParsingType != LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -70,7 +70,7 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
|||||||
file = new File(filePath);
|
file = new File(filePath);
|
||||||
}
|
}
|
||||||
|
|
||||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
|
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
|
||||||
prepareStorage(layoutParsingRequest, file);
|
prepareStorage(layoutParsingRequest, file);
|
||||||
|
|
||||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||||
|
|||||||
@ -56,7 +56,7 @@ public class OutlineDetectionTest extends AbstractTest {
|
|||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER);
|
ClassificationDocument classificationDocument = parseLayout(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
|
||||||
|
|
||||||
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
|
OutlineObjectTree outlineObjectTree = classificationDocument.getOutlineObjectTree();
|
||||||
assertEquals(outlineObjectTree.getRootNodes().size(), 8);
|
assertEquals(outlineObjectTree.getRootNodes().size(), 8);
|
||||||
|
|||||||
@ -64,8 +64,8 @@ public class SimplifiedTextServiceTest
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
protected Document buildGraph(File file) {
|
protected Document buildGraph(File file) {
|
||||||
|
|
||||||
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||||
file,
|
file,
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse(),
|
new TableServiceResponse(),
|
||||||
|
|||||||
@ -56,8 +56,8 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void writeJsons(Path filename) {
|
private void writeJsons(Path filename) {
|
||||||
|
|
||||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||||
filename.toFile(),
|
filename.toFile(),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse(),
|
new TableServiceResponse(),
|
||||||
|
|||||||
@ -38,7 +38,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
|
||||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
|
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
|
||||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -54,7 +54,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
|
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
|
||||||
|
|
||||||
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||||
originDocument,
|
originDocument,
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
tableServiceResponse,
|
tableServiceResponse,
|
||||||
@ -122,7 +122,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
.get(0).getTextBlocks()
|
.get(0).getTextBlocks()
|
||||||
.get(0).toString()).contains(textToSearch);
|
.get(0).toString()).contains(textToSearch);
|
||||||
|
|
||||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
|
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, classificationDocument);
|
||||||
|
|
||||||
TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock();
|
TextBlock leafTextBlock = document.getFirstPage().getHeader().getLeafTextBlock();
|
||||||
assertThat(leafTextBlock.getSearchText().contains(textToSearch)).isTrue();
|
assertThat(leafTextBlock.getSearchText().contains(textToSearch)).isTrue();
|
||||||
|
|||||||
@ -103,15 +103,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void writeJsons(Path filename) {
|
private void writeJsons(Path filename) {
|
||||||
|
|
||||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||||
filename.toFile(),
|
filename.toFile(),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse(),
|
new TableServiceResponse(),
|
||||||
new VisualLayoutParsingResponse(),
|
new VisualLayoutParsingResponse(),
|
||||||
Map.of("file",filename.toFile().toString())));
|
Map.of("file",filename.toFile().toString())));
|
||||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||||
filename.toFile(),
|
filename.toFile(),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse(),
|
new TableServiceResponse(),
|
||||||
|
|||||||
@ -50,7 +50,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
|||||||
|
|
||||||
|
|
||||||
if (!filename.startsWith("files") && filename.startsWith("/")) {
|
if (!filename.startsWith("files") && filename.startsWith("/")) {
|
||||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true);
|
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, true);
|
||||||
prepareStorage(layoutParsingRequest, new File(filename));
|
prepareStorage(layoutParsingRequest, new File(filename));
|
||||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType,
|
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType,
|
||||||
layoutParsingPipeline.parseLayout(layoutParsingType,
|
layoutParsingPipeline.parseLayout(layoutParsingType,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user