DM-589: Filter out wrongly detected cells that are page borders caused by rotation during scanning
This commit is contained in:
parent
144a9591a2
commit
dacc2f7f43
@ -8,6 +8,7 @@ import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.PageInfo;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
|
||||
@ -24,24 +25,26 @@ public class CvTableParsingAdapter {
|
||||
Map<Integer, List<TableCells>> tableCells = new HashMap<>();
|
||||
tableServiceResponse.getData()
|
||||
.forEach(tableData -> tableCells.computeIfAbsent(tableData.getPageInfo().getNumber(), tableCell -> new ArrayList<>())
|
||||
.addAll(convertTableCells(tableData.getTableCells())));
|
||||
.addAll(convertTableCells(tableData.getTableCells(), tableData.getPageInfo())));
|
||||
|
||||
return tableCells;
|
||||
}
|
||||
|
||||
|
||||
private Collection<TableCells> convertTableCells(List<TableCells> tableCells) {
|
||||
private Collection<TableCells> convertTableCells(List<TableCells> tableCells, PageInfo pageInfo) {
|
||||
|
||||
List<TableCells> cvParsedTableCells = new ArrayList<>();
|
||||
|
||||
tableCells.forEach(t -> cvParsedTableCells.add(TableCells.builder()
|
||||
.y0(t.getY0())
|
||||
.x1(t.getX1())
|
||||
.y1(t.getY1())
|
||||
.x0(t.getX0())
|
||||
.width(t.getWidth())
|
||||
.height(t.getHeight())
|
||||
.build()));
|
||||
tableCells.stream()
|
||||
.filter(cell -> cell.getWidth() < pageInfo.getWidth() * 0.98 && cell.getHeight() < pageInfo.getHeight() * 0.98)
|
||||
.forEach(t -> cvParsedTableCells.add(TableCells.builder()
|
||||
.y0(t.getY0())
|
||||
.x1(t.getX1())
|
||||
.y1(t.getY1())
|
||||
.x0(t.getX0())
|
||||
.width(t.getWidth())
|
||||
.height(t.getHeight())
|
||||
.build()));
|
||||
|
||||
return cvParsedTableCells;
|
||||
}
|
||||
|
||||
@ -61,12 +61,13 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
private SectionsBuilderService sectionsBuilderService;
|
||||
|
||||
|
||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
||||
@SneakyThrows
|
||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument, TableServiceResponse tableServiceResponse) {
|
||||
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse());
|
||||
tableServiceResponse);
|
||||
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
|
||||
@ -76,6 +77,13 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
||||
|
||||
return buildClassificationDocument(originDocument, new TableServiceResponse());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void tablesToHtmlDebugger() throws IOException {
|
||||
|
||||
@ -88,6 +96,39 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void tablesToHtmlDebuggerWithCVResponse() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/cv_tables/ScanRotationBorder.pdf");
|
||||
ClassPathResource cvTablesResource = new ClassPathResource("files/cv_tables/ScanRotationBorder.TABLES.json");
|
||||
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()), tableServiceResponse);
|
||||
|
||||
toHtml(document, "/tmp/ScanRotationBorder.html");
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testScanRotationBorderIsIgnored() throws IOException {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/cv_tables/ScanRotationBorder.pdf");
|
||||
ClassPathResource cvTablesResource = new ClassPathResource("files/cv_tables/ScanRotationBorder.TABLES.json");
|
||||
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()), tableServiceResponse);
|
||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
||||
|
||||
// Quality of the table parsing is not good, because the file is rotated at scanning.
|
||||
// We only asset that the table border is not the page border.
|
||||
tables.forEach(table -> {
|
||||
assertThat(table.getMinX()).isGreaterThan(75);
|
||||
assertThat(table.getMaxX()).isLessThan(512);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testMapping() {
|
||||
|
||||
File diff suppressed because one or more lines are too long
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user