* fixed bug with incorrect empty cell count by adding threshold to cell.contains

This commit is contained in:
yhampe 2023-11-15 10:44:47 +01:00
parent f69331e7d8
commit c3e69b2cdf
7 changed files with 270 additions and 38 deletions

View File

@ -190,14 +190,14 @@ public class LayoutParsingPipeline {
PDRectangle cropbox = pdPage.getCropBox(); PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
stripper.getRulings(), stripper.getRulings(),
1, stripper.getMinCharWidth(),
1); stripper.getMaxCharHeight());
List<Rectangle> spreedSheetArea = tableExtractionService.getSpreadSheetArea(cleanRulings, layoutParsingType); List<Rectangle> spreedSheetArea = tableExtractionService.getSpreadSheetArea(cleanRulings, layoutParsingType);
Map<String,Float> newValues = calculateMinCharWidthAndMaxCharHeightInsideTable(stripper,spreedSheetArea,10f,1f); Map<String,Float> newValues = calculateMinCharWidthAndMaxCharHeightInsideTable(stripper,spreedSheetArea);
cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), newValues.get("minCharWidth"), newValues.get("maxCharHeight")); cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), newValues.get("minCharWidth"), newValues.get("minCharHeigth"));
ClassificationPage classificationPage = switch (layoutParsingType) { ClassificationPage classificationPage = switch (layoutParsingType) {
@ -255,17 +255,17 @@ public class LayoutParsingPipeline {
* @return Map with both values * @return Map with both values
*/ */
private Map<String, Float> calculateMinCharWidthAndMaxCharHeightInsideTable(PDFLinesTextStripper stripper, List<Rectangle> spreedSheetArea, float initialMinCharWidth, float initialMaxCharHeight) { private Map<String, Float> calculateMinCharWidthAndMaxCharHeightInsideTable(PDFLinesTextStripper stripper, List<Rectangle> spreedSheetArea) {
float newMinCharWidth = initialMinCharWidth; float newMinCharWidth = 10;
float newMaxCharHeight = initialMaxCharHeight; float newMinCharHeight = 30;
Map<String,Float> result = new HashMap<>(); Map<String,Float> result = new HashMap<>();
for(var textPositionSequence: stripper.getTextPositionSequences() ) { for(var textPositionSequence: stripper.getTextPositionSequences() ) {
for(var redTextPosition: textPositionSequence.getTextPositions()) { for(var redTextPosition: textPositionSequence.getTextPositions()) {
for(var area: spreedSheetArea) { for(var area: spreedSheetArea) {
if(area.contains(redTextPosition.getPosition()[0], redTextPosition.getPosition()[1], redTextPosition.getPosition()[2], redTextPosition.getPosition()[3])) { if(area.contains(redTextPosition.getPosition()[0], redTextPosition.getPosition()[1], redTextPosition.getPosition()[2], redTextPosition.getPosition()[3])) {
if(redTextPosition.getHeightDir() > newMaxCharHeight) { if(redTextPosition.getHeightDir() < newMinCharHeight) {
newMaxCharHeight = redTextPosition.getHeightDir(); newMinCharHeight = redTextPosition.getHeightDir();
} }
if(redTextPosition.getWidthDirAdj() < newMinCharWidth) { if(redTextPosition.getWidthDirAdj() < newMinCharWidth) {
newMinCharWidth = redTextPosition.getWidthDirAdj(); newMinCharWidth = redTextPosition.getWidthDirAdj();
@ -275,7 +275,7 @@ public class LayoutParsingPipeline {
} }
} }
result.put("minCharWidth",newMinCharWidth); result.put("minCharWidth",newMinCharWidth);
result.put("maxCharHeight",newMaxCharHeight); result.put("minCharHeigth",newMinCharHeight);
return result; return result;
} }

View File

@ -35,6 +35,7 @@ public class Table implements SemanticNode {
int numberOfRows; int numberOfRows;
int numberOfCols; int numberOfCols;
int firstpage; int firstpage;
int emptyCells;
TextBlock textBlock; TextBlock textBlock;
@Builder.Default @Builder.Default
@ -208,7 +209,6 @@ public class Table implements SemanticNode {
return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col)); return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col));
} }
/** /**
* Streams all TableCells row-wise and filters them with header == true. * Streams all TableCells row-wise and filters them with header == true.
* *

View File

@ -252,7 +252,7 @@ public class TablePageBlock extends AbstractPageBlock {
if (prevY != null && prevX != null) { if (prevY != null && prevX != null) {
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y)); var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst(); var intersectionCell = cells.stream().filter(c -> cell.intersects(c)).findFirst();
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks())); intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
if (cell.hasMinimumSize()) { if (cell.hasMinimumSize()) {
row.add(cell); row.add(cell);

View File

@ -1,6 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.services; package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Point2D; import java.awt.geom.Point2D;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashMap; import java.util.HashMap;
@ -10,6 +13,8 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.pdfbox.Loader;
import org.springframework.core.io.ClassPathResource;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
@ -66,6 +71,20 @@ public class TableExtractionService {
}; };
public boolean contains(Cell cell, double x, double y, double w, double h) {
if (cell.isEmpty() || w <= 0 || h <= 0) {
return false;
}
double x0 = cell.getX();
double y0 = cell.getY();
return (x >= x0-2 &&
y >= y0-2 &&
(x + w) <= x0 + cell.getWidth()+2 &&
(y + h) <= y0 + cell.getHeight()+2);
}
/** /**
* Finds tables on a page and moves textblocks into cells of the found tables. * Finds tables on a page and moves textblocks into cells of the found tables.
* Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the page rotation. * Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the page rotation.
@ -84,13 +103,12 @@ public class TableExtractionService {
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType); List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType);
List<TextPageBlock> toBeRemoved = new ArrayList<>(); List<TextPageBlock> toBeRemoved = new ArrayList<>();
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) { for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock; TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
for (Cell cell : cells) { for (Cell cell : cells) {
if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(), if (cell.hasMinimumSize() && contains(cell, textBlock.getPdfMinX(),
textBlock.getPdfMinY(), textBlock.getPdfMinY(),
textBlock.getPdfMaxX() - textBlock.getPdfMinX(), textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) { textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
@ -104,7 +122,7 @@ public class TableExtractionService {
cells = new ArrayList<>(new HashSet<>(cells)); cells = new ArrayList<>(new HashSet<>(cells));
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER); DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).toList(); List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells);
List<TablePageBlock> tables = new ArrayList<>(); List<TablePageBlock> tables = new ArrayList<>();
for (Rectangle area : spreadsheetAreas) { for (Rectangle area : spreadsheetAreas) {
@ -131,9 +149,18 @@ public class TableExtractionService {
if (position != -1) { if (position != -1) {
page.getTextBlocks().add(position, table); page.getTextBlocks().add(position, table);
} }
String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/page1.tables.html";
try (FileOutputStream fileOutputStream = new FileOutputStream(Path.of(tmpFileName).toFile())) {
fileOutputStream.write(table.getTextAsHtml().getBytes());
}
catch (IOException e) {
throw new RuntimeException(e);
}
} }
page.getTextBlocks().removeAll(toBeRemoved); page.getTextBlocks().removeAll(toBeRemoved);
} }
public List<Rectangle> getSpreadSheetArea(CleanRulings cleanRulings, LayoutParsingType layoutParsingType) { public List<Rectangle> getSpreadSheetArea(CleanRulings cleanRulings, LayoutParsingType layoutParsingType) {

View File

@ -2,14 +2,31 @@ package com.knecon.fforesight.service.layoutparser.server.graph;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.List;
import org.apache.pdfbox.Loader; import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource; import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
@ -18,19 +35,80 @@ import lombok.SneakyThrows;
public class ViewerDocumentTest extends BuildDocumentTest { public class ViewerDocumentTest extends BuildDocumentTest {
@Autowired
private SectionsBuilderService sectionsBuilderService;
@Autowired
private RedactManagerClassificationService redactManagerClassificationService;
@Test @Test
@Disabled
@SneakyThrows @SneakyThrows
public void testViewerDocument() { public void testViewerDocument() {
String fileName = "files/2Tables.pdf";
String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/2Tables.lines.pdf";
LayoutGridService layoutGridService = new LayoutGridService(); LayoutGridService layoutGridService = new LayoutGridService();
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
String fileName = "files/bdr/notMergedParagraphs.pdf";
Document document = buildGraph(fileName, LayoutParsingType.TAAS); Document document = buildGraph(fileName, LayoutParsingType.TAAS);
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) { try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
viewerDocumentService.createViewerDocument(pdDocument, document, out, true); viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
} }
} }
@Test
@SneakyThrows
public void testTableViewerDocument() {
String fileName = "C:\\Users\\YannikHampe\\repos\\layout-parser\\layoutparser-service\\layoutparser-service-server\\src\\test\\resources\\files\\SinglePages\\VV-931175_Page1.pdf";
String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/page1.lines.pdf";
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Loader.loadPDF(Path.of(fileName).toFile()),
new ImageServiceResponse(),
new TableServiceResponse()));
LayoutGridService layoutGridService = new LayoutGridService();
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
try (var pdDocument = Loader.loadPDF(Path.of(fileName).toFile()); var out = new FileOutputStream(tmpFileName)) {
viewerDocumentService.createViewerDocument(pdDocument, documentGraph, out, true);
}
//durch rows
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
int emptyCellCount = 0;
List listStructure2 = documentData.getDocumentStructure()
.streamAllEntries()
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
.map(DocumentStructure.EntryData::getProperties)
.map(properties -> {
var builder = Table.builder();
PropertiesMapper.parseTableProperties(properties, builder);
return builder.build();
}).toList();
for(int i = 0; i < listStructure2.size(); i++) {
emptyCellCount = ((Table) listStructure2.get(i)).getEmptyCells();
}
System.out.println("Empty cells "+emptyCellCount);
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(Path.of(fileName).toFile()));
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
int emptyCellsFoundFound = table.getRows().stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
for (List<Cell> row : table.getRows()) {
System.out.println(row.toString());
}
System.out.println("Actual number of empty rows: "+emptyCellsFoundFound);
}
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
originDocument,
new ImageServiceResponse(),
new TableServiceResponse());
redactManagerClassificationService.classifyDocument(classificationDocument);
sectionsBuilderService.buildSections(classificationDocument);
return classificationDocument;
}
} }

View File

@ -29,6 +29,8 @@ import java.util.stream.Collectors;
import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThat;
import javax.sound.midi.SysexMessage;
public class PdfSegmentationServiceTest extends AbstractTest { public class PdfSegmentationServiceTest extends AbstractTest {
@Autowired @Autowired
@ -52,7 +54,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) { public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE, ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
originDocument, originDocument,
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse()); new TableServiceResponse());
@ -166,8 +168,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTable(document, 0, 1, 1, 0, 0); validateTable(document, 0, 1, 1, 0, 0);
validateTable(document, 1, 2, 2, 0, 0); validateTable(document, 1, 2, 2, 0, 0);
validateTable(document, 2, 7, 20, 0, 140); validateTable(document, 2, 7, 20, 0, 0);
validateTable(document, 3, 8, 31, 0, 170); validateTable(document, 3, 8, 31, 0, 0);
} }
@ -181,7 +183,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 1); validateTableSize(document, 1);
validateTable(document, 0, 8, 8, 0, 2); validateTable(document, 0, 8, 8, 0, 0);
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR", List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
"Author, date", "Author, date",
@ -191,18 +193,18 @@ public class PdfSegmentationServiceTest extends AbstractTest {
"Method meets analytical validation criteria", "Method meets analytical validation criteria",
"Remarks (in case validation criteria are not met)", "Remarks (in case validation criteria are not met)",
"Acceptability of the method"), "Acceptability of the method"),
Arrays.asList("", Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies", "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"), "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
Arrays.asList("CA 7.1.2.1.1 DAR (2009)", Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
"Evans P.G. 2001 TMJ4569B, VV-323245", "Evans P.G. 2001 TMJ4569B, VV-323245",
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom", "Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845 in a Trial Carried", "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD", "LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
"Y", "Y",
"N/A", "N/A",
@ -239,8 +241,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 2); validateTableSize(document, 2);
validateTable(document, 0, 5, 5, 0, 23); validateTable(document, 0, 5, 5, 0, 0);
validateTable(document, 1, 11, 9, 0, 36); validateTable(document, 1, 11, 9, 0, 0);
} }
@ -328,7 +330,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile())); ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
validateTableSize(document, 1); validateTableSize(document, 1);
validateTable(document, 0, 10, 6, 0, 1); validateTable(document, 0, 10, 6, 0, 0);
} }
@ -450,8 +452,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 2); validateTableSize(document, 2);
validateTable(document, 0, 6, 8, 0, 2); validateTable(document, 0, 6, 8, 0, 0);
validateTable(document, 1, 6, 8, 0, 1); validateTable(document, 1, 6, 8, 0, 0);
} }
@ -465,7 +467,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 1); validateTableSize(document, 1);
validateTable(document, 0, 9, 5, 2, 0); validateTable(document, 0, 9, 5, 0, 0);
} }
@ -490,6 +492,9 @@ public class PdfSegmentationServiceTest extends AbstractTest {
List<List<Cell>> rows = table.getRows(); List<List<Cell>> rows = table.getRows();
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size(); int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
for (List<Cell> row : table.getRows()) {
row.forEach(r -> System.out.println(r.toString()));
}
assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect); assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect);
assertThat(table.getColCount()).isEqualTo(colCount); assertThat(table.getColCount()).isEqualTo(colCount);

View File

@ -1,37 +1,159 @@
package com.knecon.fforesight.service.layoutparser.server.services; package com.knecon.fforesight.service.layoutparser.server.services;
import java.io.File;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import org.junit.jupiter.api.Test; import javax.print.Doc;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents; import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw; import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows; import lombok.SneakyThrows;
public class RulingCleaningServiceTest { public class RulingCleaningServiceTest extends BuildDocumentTest {
@Test @Test
// @Disabled // @Disabled
@SneakyThrows @SneakyThrows
public void textRulingExtraction() { public void textRulingExtraction() {
String fileName = "files/211.pdf"; String fileName = "/files/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf";
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf"; String lineFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.after.pdf";
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName); List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
RulingCleaningService rulingCleaningService = new RulingCleaningService(); RulingCleaningService rulingCleaningService = new RulingCleaningService();
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>(); List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
writeJsons(Path.of(fileName));
for (PageContents pageContent : pageContents) { for (PageContents pageContent : pageContents) {
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 20)); cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 1));
}
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
}
@Test
@SneakyThrows
public void testTableExtractionSingle() {
String filename ="C:\\Users\\YannikHampe\\repos\\layout-parser\\layoutparser-service\\layoutparser-service-server\\src\\test\\resources\\files\\SinglePages\\24 - SYN549522 - Acute Oral Toxicity - Rats_Page17.pdf";
writeJsons(Path.of(filename));
}
@Test
@SneakyThrows
public void testTableExtraction() {
LayoutGridService layoutGridService = new LayoutGridService();
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
ClassPathResource resource = new ClassPathResource("files");
List<String> pdfFileNames = Files.walk(resource.getFile().toPath())
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
.map(Path::toAbsolutePath)
.map(Path::toString)
.toList();
for (int i = 0; i < pdfFileNames.size(); i++) {
writeJsons(Path.of(pdfFileNames.get(i)));
} }
} }
@SneakyThrows
private void writeJsons(Path filename) {
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Loader.loadPDF(filename.toFile()),
new ImageServiceResponse(),
new TableServiceResponse()));
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Loader.loadPDF(filename.toFile()),
new ImageServiceResponse(),
new TableServiceResponse()));
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
if(!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
String tmpFileNameBefore = "C:/Users/YANNIK~1/AppData/Local/Temp/before."+filename.getFileName().toString();;
System.out.println(tmpFileNameBefore);
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
PdfDraw.drawDocumentGraph(pdDocument, documentGraphBefore);
pdDocument.save(tmpFileNameBefore);
}
String tmpFileNameAfter = "C:/Users/YANNIK~1/AppData/Local/Temp/after."+filename.getFileName().toString();;
System.out.println(tmpFileNameAfter);
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
PdfDraw.drawDocumentGraph(pdDocument, documentGraphAfter);
pdDocument.save(tmpFileNameAfter);
}
}
}
@SneakyThrows
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2, String pdfName) {
List listStructure1 = structure1
.streamAllEntries()
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
.map(DocumentStructure.EntryData::getProperties)
.map(properties -> {
var builder = Table.builder();
PropertiesMapper.parseTableProperties(properties, builder);
return builder.build();
}).toList();
List listStructure2 = structure2
.streamAllEntries()
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
.map(DocumentStructure.EntryData::getProperties)
.map(properties -> {
var builder = Table.builder();
PropertiesMapper.parseTableProperties(properties, builder);
return builder.build();
}).toList();
for(int i = 0; i < listStructure1.size(); i++) {
Table tableNode1 = (Table) listStructure1.get(i);
Table tableNode2 = (Table) listStructure2.get(i);
if(tableNode1.getNumberOfRows() != tableNode2.getNumberOfRows() || tableNode1.getNumberOfCols() != tableNode2.getNumberOfCols()) {
return false;
}
}
return true;
}
} }