DM-589: Filter out wrongly detected cells that are page borders caused by rotation at scanning

This commit is contained in:
Dominique Eifländer 2023-11-20 15:54:02 +01:00
parent 144a9591a2
commit dacc2f7f43
4 changed files with 57 additions and 12 deletions

View File

@ -8,6 +8,7 @@ import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.PageInfo;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
@ -24,24 +25,26 @@ public class CvTableParsingAdapter {
Map<Integer, List<TableCells>> tableCells = new HashMap<>();
tableServiceResponse.getData()
.forEach(tableData -> tableCells.computeIfAbsent(tableData.getPageInfo().getNumber(), tableCell -> new ArrayList<>())
.addAll(convertTableCells(tableData.getTableCells())));
.addAll(convertTableCells(tableData.getTableCells(), tableData.getPageInfo())));
return tableCells;
}
// Copies the given cells into newly built TableCells instances, carrying over
// x0/y0/x1/y1/width/height.
// NOTE(review): this span is a flattened diff. The one-argument signature and the
// plain forEach are the removed version; the two-argument signature and the
// stream/filter pipeline are the added version.
private Collection<TableCells> convertTableCells(List<TableCells> tableCells) {
private Collection<TableCells> convertTableCells(List<TableCells> tableCells, PageInfo pageInfo) {
List<TableCells> cvParsedTableCells = new ArrayList<>();
tableCells.forEach(t -> cvParsedTableCells.add(TableCells.builder()
.y0(t.getY0())
.x1(t.getX1())
.y1(t.getY1())
.x0(t.getX0())
.width(t.getWidth())
.height(t.getHeight())
.build()));
tableCells.stream()
// Keep only cells that are both narrower than 98% of the page width and shorter
// than 98% of the page height. Per the commit title, cells reaching the page size
// are scan-rotation borders wrongly detected as table cells.
.filter(cell -> cell.getWidth() < pageInfo.getWidth() * 0.98 && cell.getHeight() < pageInfo.getHeight() * 0.98)
.forEach(t -> cvParsedTableCells.add(TableCells.builder()
.y0(t.getY0())
.x1(t.getX1())
.y1(t.getY1())
.x0(t.getX0())
.width(t.getWidth())
.height(t.getHeight())
.build()));
return cvParsedTableCells;
}

View File

@ -61,12 +61,13 @@ public class PdfSegmentationServiceTest extends AbstractTest {
private SectionsBuilderService sectionsBuilderService;
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
@SneakyThrows
public ClassificationDocument buildClassificationDocument(PDDocument originDocument, TableServiceResponse tableServiceResponse) {
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
originDocument,
new ImageServiceResponse(),
new TableServiceResponse());
tableServiceResponse);
redactManagerClassificationService.classifyDocument(classificationDocument);
@ -76,6 +77,13 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
/**
 * Convenience overload that builds the classification document with a
 * default-constructed {@link TableServiceResponse}.
 *
 * @param originDocument the PDF document to parse
 * @return the result of the two-argument {@code buildClassificationDocument}
 */
@SneakyThrows
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
    TableServiceResponse defaultTableResponse = new TableServiceResponse();
    return buildClassificationDocument(originDocument, defaultTableResponse);
}
@Test
public void tablesToHtmlDebugger() throws IOException {
@ -88,6 +96,39 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
/**
 * Debugging aid: parses the ScanRotationBorder PDF together with its recorded
 * CV table response and writes the result to /tmp/ScanRotationBorder.html.
 *
 * @throws IOException if a resource cannot be read or the PDF cannot be loaded
 */
@Test
public void tablesToHtmlDebuggerWithCVResponse() throws IOException {
    ClassPathResource pdfFileResource = new ClassPathResource("files/cv_tables/ScanRotationBorder.pdf");
    ClassPathResource cvTablesResource = new ClassPathResource("files/cv_tables/ScanRotationBorder.TABLES.json");
    var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
    // PDDocument is Closeable; release it only after the HTML dump has been written.
    try (PDDocument pdDocument = Loader.loadPDF(pdfFileResource.getFile())) {
        ClassificationDocument document = buildClassificationDocument(pdDocument, tableServiceResponse);
        toHtml(document, "/tmp/ScanRotationBorder.html");
    }
}
/**
 * Verifies that, for the rotated scan fixture, at least one table is found and
 * every table's horizontal extent stays inside fixed bounds (minX > 75, maxX < 512).
 *
 * @throws IOException if a resource cannot be read or the PDF cannot be loaded
 */
@Test
public void testScanRotationBorderIsIgnored() throws IOException {
    ClassPathResource pdfFileResource = new ClassPathResource("files/cv_tables/ScanRotationBorder.pdf");
    ClassPathResource cvTablesResource = new ClassPathResource("files/cv_tables/ScanRotationBorder.TABLES.json");
    var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
    // PDDocument is Closeable; keep it open until all assertions have run.
    try (PDDocument pdDocument = Loader.loadPDF(pdfFileResource.getFile())) {
        ClassificationDocument document = buildClassificationDocument(pdDocument, tableServiceResponse);
        // Collect the tables of all sections once and reuse the list for every assertion.
        var tables = document.getSections().stream().flatMap(section -> section.getTables().stream()).toList();
        assertThat(tables).isNotEmpty();
        // Quality of the table parsing is not good, because the file is rotated at scanning.
        // We only assert that the table border is not the page border.
        tables.forEach(table -> {
            assertThat(table.getMinX()).isGreaterThan(75);
            assertThat(table.getMaxX()).isLessThan(512);
        });
    }
}
@Test
@SneakyThrows
public void testMapping() {