* fixed bug with incorrect empty cell count by adding threshold to cell.contains
This commit is contained in:
parent
f69331e7d8
commit
c3e69b2cdf
@ -190,14 +190,14 @@ public class LayoutParsingPipeline {
|
|||||||
PDRectangle cropbox = pdPage.getCropBox();
|
PDRectangle cropbox = pdPage.getCropBox();
|
||||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
|
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
|
||||||
stripper.getRulings(),
|
stripper.getRulings(),
|
||||||
1,
|
stripper.getMinCharWidth(),
|
||||||
1);
|
stripper.getMaxCharHeight());
|
||||||
|
|
||||||
List<Rectangle> spreedSheetArea = tableExtractionService.getSpreadSheetArea(cleanRulings, layoutParsingType);
|
List<Rectangle> spreedSheetArea = tableExtractionService.getSpreadSheetArea(cleanRulings, layoutParsingType);
|
||||||
|
|
||||||
Map<String,Float> newValues = calculateMinCharWidthAndMaxCharHeightInsideTable(stripper,spreedSheetArea,10f,1f);
|
Map<String,Float> newValues = calculateMinCharWidthAndMaxCharHeightInsideTable(stripper,spreedSheetArea);
|
||||||
|
|
||||||
cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), newValues.get("minCharWidth"), newValues.get("maxCharHeight"));
|
cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), newValues.get("minCharWidth"), newValues.get("minCharHeigth"));
|
||||||
|
|
||||||
|
|
||||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||||
@ -255,17 +255,17 @@ public class LayoutParsingPipeline {
|
|||||||
* @return Map with both values
|
* @return Map with both values
|
||||||
*/
|
*/
|
||||||
|
|
||||||
private Map<String, Float> calculateMinCharWidthAndMaxCharHeightInsideTable(PDFLinesTextStripper stripper, List<Rectangle> spreedSheetArea, float initialMinCharWidth, float initialMaxCharHeight) {
|
private Map<String, Float> calculateMinCharWidthAndMaxCharHeightInsideTable(PDFLinesTextStripper stripper, List<Rectangle> spreedSheetArea) {
|
||||||
|
|
||||||
float newMinCharWidth = initialMinCharWidth;
|
float newMinCharWidth = 10;
|
||||||
float newMaxCharHeight = initialMaxCharHeight;
|
float newMinCharHeight = 30;
|
||||||
Map<String,Float> result = new HashMap<>();
|
Map<String,Float> result = new HashMap<>();
|
||||||
for(var textPositionSequence: stripper.getTextPositionSequences() ) {
|
for(var textPositionSequence: stripper.getTextPositionSequences() ) {
|
||||||
for(var redTextPosition: textPositionSequence.getTextPositions()) {
|
for(var redTextPosition: textPositionSequence.getTextPositions()) {
|
||||||
for(var area: spreedSheetArea) {
|
for(var area: spreedSheetArea) {
|
||||||
if(area.contains(redTextPosition.getPosition()[0], redTextPosition.getPosition()[1], redTextPosition.getPosition()[2], redTextPosition.getPosition()[3])) {
|
if(area.contains(redTextPosition.getPosition()[0], redTextPosition.getPosition()[1], redTextPosition.getPosition()[2], redTextPosition.getPosition()[3])) {
|
||||||
if(redTextPosition.getHeightDir() > newMaxCharHeight) {
|
if(redTextPosition.getHeightDir() < newMinCharHeight) {
|
||||||
newMaxCharHeight = redTextPosition.getHeightDir();
|
newMinCharHeight = redTextPosition.getHeightDir();
|
||||||
}
|
}
|
||||||
if(redTextPosition.getWidthDirAdj() < newMinCharWidth) {
|
if(redTextPosition.getWidthDirAdj() < newMinCharWidth) {
|
||||||
newMinCharWidth = redTextPosition.getWidthDirAdj();
|
newMinCharWidth = redTextPosition.getWidthDirAdj();
|
||||||
@ -275,7 +275,7 @@ public class LayoutParsingPipeline {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
result.put("minCharWidth",newMinCharWidth);
|
result.put("minCharWidth",newMinCharWidth);
|
||||||
result.put("maxCharHeight",newMaxCharHeight);
|
result.put("minCharHeigth",newMinCharHeight);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -35,6 +35,7 @@ public class Table implements SemanticNode {
|
|||||||
int numberOfRows;
|
int numberOfRows;
|
||||||
int numberOfCols;
|
int numberOfCols;
|
||||||
int firstpage;
|
int firstpage;
|
||||||
|
int emptyCells;
|
||||||
TextBlock textBlock;
|
TextBlock textBlock;
|
||||||
|
|
||||||
@Builder.Default
|
@Builder.Default
|
||||||
@ -208,7 +209,6 @@ public class Table implements SemanticNode {
|
|||||||
return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col));
|
return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Streams all TableCells row-wise and filters them with header == true.
|
* Streams all TableCells row-wise and filters them with header == true.
|
||||||
*
|
*
|
||||||
|
|||||||
@ -252,7 +252,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
if (prevY != null && prevX != null) {
|
if (prevY != null && prevX != null) {
|
||||||
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
||||||
|
|
||||||
var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
|
var intersectionCell = cells.stream().filter(c -> cell.intersects(c)).findFirst();
|
||||||
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
|
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
|
||||||
if (cell.hasMinimumSize()) {
|
if (cell.hasMinimumSize()) {
|
||||||
row.add(cell);
|
row.add(cell);
|
||||||
|
|||||||
@ -1,6 +1,9 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Path;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
@ -10,6 +13,8 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.Loader;
|
||||||
|
import org.springframework.core.io.ClassPathResource;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
@ -66,6 +71,20 @@ public class TableExtractionService {
|
|||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
public boolean contains(Cell cell, double x, double y, double w, double h) {
|
||||||
|
if (cell.isEmpty() || w <= 0 || h <= 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
double x0 = cell.getX();
|
||||||
|
double y0 = cell.getY();
|
||||||
|
return (x >= x0-2 &&
|
||||||
|
y >= y0-2 &&
|
||||||
|
(x + w) <= x0 + cell.getWidth()+2 &&
|
||||||
|
(y + h) <= y0 + cell.getHeight()+2);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Finds tables on a page and moves textblocks into cells of the found tables.
|
* Finds tables on a page and moves textblocks into cells of the found tables.
|
||||||
* Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the page rotation.
|
* Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||||
@ -84,13 +103,12 @@ public class TableExtractionService {
|
|||||||
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType);
|
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
List<TextPageBlock> toBeRemoved = new ArrayList<>();
|
List<TextPageBlock> toBeRemoved = new ArrayList<>();
|
||||||
|
|
||||||
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||||
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
|
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
|
||||||
for (Cell cell : cells) {
|
for (Cell cell : cells) {
|
||||||
if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(),
|
if (cell.hasMinimumSize() && contains(cell, textBlock.getPdfMinX(),
|
||||||
textBlock.getPdfMinY(),
|
textBlock.getPdfMinY(),
|
||||||
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
|
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
|
||||||
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
|
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
|
||||||
@ -104,7 +122,7 @@ public class TableExtractionService {
|
|||||||
cells = new ArrayList<>(new HashSet<>(cells));
|
cells = new ArrayList<>(new HashSet<>(cells));
|
||||||
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
|
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
|
||||||
|
|
||||||
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).toList();
|
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells);
|
||||||
|
|
||||||
List<TablePageBlock> tables = new ArrayList<>();
|
List<TablePageBlock> tables = new ArrayList<>();
|
||||||
for (Rectangle area : spreadsheetAreas) {
|
for (Rectangle area : spreadsheetAreas) {
|
||||||
@ -131,9 +149,18 @@ public class TableExtractionService {
|
|||||||
if (position != -1) {
|
if (position != -1) {
|
||||||
page.getTextBlocks().add(position, table);
|
page.getTextBlocks().add(position, table);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/page1.tables.html";
|
||||||
|
try (FileOutputStream fileOutputStream = new FileOutputStream(Path.of(tmpFileName).toFile())) {
|
||||||
|
fileOutputStream.write(table.getTextAsHtml().getBytes());
|
||||||
|
}
|
||||||
|
catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
page.getTextBlocks().removeAll(toBeRemoved);
|
page.getTextBlocks().removeAll(toBeRemoved);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<Rectangle> getSpreadSheetArea(CleanRulings cleanRulings, LayoutParsingType layoutParsingType) {
|
public List<Rectangle> getSpreadSheetArea(CleanRulings cleanRulings, LayoutParsingType layoutParsingType) {
|
||||||
|
|||||||
@ -2,14 +2,31 @@ package com.knecon.fforesight.service.layoutparser.server.graph;
|
|||||||
|
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.pdfbox.Loader;
|
import org.apache.pdfbox.Loader;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.junit.jupiter.api.Disabled;
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
import org.springframework.core.io.ClassPathResource;
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||||
@ -18,19 +35,80 @@ import lombok.SneakyThrows;
|
|||||||
|
|
||||||
public class ViewerDocumentTest extends BuildDocumentTest {
|
public class ViewerDocumentTest extends BuildDocumentTest {
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private SectionsBuilderService sectionsBuilderService;
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private RedactManagerClassificationService redactManagerClassificationService;
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Disabled
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
|
|
||||||
|
String fileName = "files/2Tables.pdf";
|
||||||
|
String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/2Tables.lines.pdf";
|
||||||
LayoutGridService layoutGridService = new LayoutGridService();
|
LayoutGridService layoutGridService = new LayoutGridService();
|
||||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
||||||
String fileName = "files/bdr/notMergedParagraphs.pdf";
|
|
||||||
Document document = buildGraph(fileName, LayoutParsingType.TAAS);
|
Document document = buildGraph(fileName, LayoutParsingType.TAAS);
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
|
||||||
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
|
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||||
viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
|
viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void testTableViewerDocument() {
|
||||||
|
|
||||||
|
String fileName = "C:\\Users\\YannikHampe\\repos\\layout-parser\\layoutparser-service\\layoutparser-service-server\\src\\test\\resources\\files\\SinglePages\\VV-931175_Page1.pdf";
|
||||||
|
String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/page1.lines.pdf";
|
||||||
|
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
|
Loader.loadPDF(Path.of(fileName).toFile()),
|
||||||
|
new ImageServiceResponse(),
|
||||||
|
new TableServiceResponse()));
|
||||||
|
LayoutGridService layoutGridService = new LayoutGridService();
|
||||||
|
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
||||||
|
try (var pdDocument = Loader.loadPDF(Path.of(fileName).toFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||||
|
viewerDocumentService.createViewerDocument(pdDocument, documentGraph, out, true);
|
||||||
|
}
|
||||||
|
//durch rows
|
||||||
|
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
|
||||||
|
int emptyCellCount = 0;
|
||||||
|
List listStructure2 = documentData.getDocumentStructure()
|
||||||
|
.streamAllEntries()
|
||||||
|
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||||
|
.map(DocumentStructure.EntryData::getProperties)
|
||||||
|
.map(properties -> {
|
||||||
|
var builder = Table.builder();
|
||||||
|
PropertiesMapper.parseTableProperties(properties, builder);
|
||||||
|
return builder.build();
|
||||||
|
}).toList();
|
||||||
|
for(int i = 0; i < listStructure2.size(); i++) {
|
||||||
|
emptyCellCount = ((Table) listStructure2.get(i)).getEmptyCells();
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println("Empty cells "+emptyCellCount);
|
||||||
|
|
||||||
|
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(Path.of(fileName).toFile()));
|
||||||
|
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||||
|
int emptyCellsFoundFound = table.getRows().stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
|
||||||
|
for (List<Cell> row : table.getRows()) {
|
||||||
|
System.out.println(row.toString());
|
||||||
|
}
|
||||||
|
System.out.println("Actual number of empty rows: "+emptyCellsFoundFound);
|
||||||
|
}
|
||||||
|
|
||||||
|
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
||||||
|
|
||||||
|
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
|
originDocument,
|
||||||
|
new ImageServiceResponse(),
|
||||||
|
new TableServiceResponse());
|
||||||
|
|
||||||
|
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||||
|
|
||||||
|
sectionsBuilderService.buildSections(classificationDocument);
|
||||||
|
|
||||||
|
return classificationDocument;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -29,6 +29,8 @@ import java.util.stream.Collectors;
|
|||||||
|
|
||||||
import static org.assertj.core.api.Assertions.assertThat;
|
import static org.assertj.core.api.Assertions.assertThat;
|
||||||
|
|
||||||
|
import javax.sound.midi.SysexMessage;
|
||||||
|
|
||||||
public class PdfSegmentationServiceTest extends AbstractTest {
|
public class PdfSegmentationServiceTest extends AbstractTest {
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
@ -52,7 +54,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
originDocument,
|
originDocument,
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse());
|
new TableServiceResponse());
|
||||||
@ -166,8 +168,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
validateTable(document, 0, 1, 1, 0, 0);
|
validateTable(document, 0, 1, 1, 0, 0);
|
||||||
validateTable(document, 1, 2, 2, 0, 0);
|
validateTable(document, 1, 2, 2, 0, 0);
|
||||||
validateTable(document, 2, 7, 20, 0, 140);
|
validateTable(document, 2, 7, 20, 0, 0);
|
||||||
validateTable(document, 3, 8, 31, 0, 170);
|
validateTable(document, 3, 8, 31, 0, 0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -181,7 +183,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
validateTableSize(document, 1);
|
validateTableSize(document, 1);
|
||||||
|
|
||||||
validateTable(document, 0, 8, 8, 0, 2);
|
validateTable(document, 0, 8, 8, 0, 0);
|
||||||
|
|
||||||
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
|
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
|
||||||
"Author, date",
|
"Author, date",
|
||||||
@ -191,18 +193,18 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
"Method meets analytical validation criteria",
|
"Method meets analytical validation criteria",
|
||||||
"Remarks (in case validation criteria are not met)",
|
"Remarks (in case validation criteria are not met)",
|
||||||
"Acceptability of the method"),
|
"Acceptability of the method"),
|
||||||
Arrays.asList("",
|
Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"",
|
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
|
||||||
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
|
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
|
||||||
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
||||||
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
||||||
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845 in a Trial Carried",
|
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
|
||||||
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
||||||
"Y",
|
"Y",
|
||||||
"N/A",
|
"N/A",
|
||||||
@ -239,8 +241,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
validateTableSize(document, 2);
|
validateTableSize(document, 2);
|
||||||
|
|
||||||
validateTable(document, 0, 5, 5, 0, 23);
|
validateTable(document, 0, 5, 5, 0, 0);
|
||||||
validateTable(document, 1, 11, 9, 0, 36);
|
validateTable(document, 1, 11, 9, 0, 0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -328,7 +330,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||||
|
|
||||||
validateTableSize(document, 1);
|
validateTableSize(document, 1);
|
||||||
validateTable(document, 0, 10, 6, 0, 1);
|
validateTable(document, 0, 10, 6, 0, 0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -450,8 +452,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
validateTableSize(document, 2);
|
validateTableSize(document, 2);
|
||||||
|
|
||||||
validateTable(document, 0, 6, 8, 0, 2);
|
validateTable(document, 0, 6, 8, 0, 0);
|
||||||
validateTable(document, 1, 6, 8, 0, 1);
|
validateTable(document, 1, 6, 8, 0, 0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -465,7 +467,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
validateTableSize(document, 1);
|
validateTableSize(document, 1);
|
||||||
|
|
||||||
validateTable(document, 0, 9, 5, 2, 0);
|
validateTable(document, 0, 9, 5, 0, 0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -490,6 +492,9 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
List<List<Cell>> rows = table.getRows();
|
List<List<Cell>> rows = table.getRows();
|
||||||
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
|
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
|
||||||
|
|
||||||
|
for (List<Cell> row : table.getRows()) {
|
||||||
|
row.forEach(r -> System.out.println(r.toString()));
|
||||||
|
}
|
||||||
assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect);
|
assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect);
|
||||||
|
|
||||||
assertThat(table.getColCount()).isEqualTo(colCount);
|
assertThat(table.getColCount()).isEqualTo(colCount);
|
||||||
|
|||||||
@ -1,37 +1,159 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.junit.jupiter.api.Test;
|
import javax.print.Doc;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.Loader;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
public class RulingCleaningServiceTest {
|
public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
// @Disabled
|
// @Disabled
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void textRulingExtraction() {
|
public void textRulingExtraction() {
|
||||||
|
|
||||||
String fileName = "files/211.pdf";
|
String fileName = "/files/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf";
|
||||||
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
|
String lineFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.after.pdf";
|
||||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||||
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
|
|
||||||
|
|
||||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||||
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
|
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
|
||||||
|
writeJsons(Path.of(fileName));
|
||||||
for (PageContents pageContent : pageContents) {
|
for (PageContents pageContent : pageContents) {
|
||||||
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 20));
|
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 1));
|
||||||
|
}
|
||||||
|
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void testTableExtractionSingle() {
|
||||||
|
String filename ="C:\\Users\\YannikHampe\\repos\\layout-parser\\layoutparser-service\\layoutparser-service-server\\src\\test\\resources\\files\\SinglePages\\24 - SYN549522 - Acute Oral Toxicity - Rats_Page17.pdf";
|
||||||
|
writeJsons(Path.of(filename));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void testTableExtraction() {
|
||||||
|
|
||||||
|
|
||||||
|
LayoutGridService layoutGridService = new LayoutGridService();
|
||||||
|
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
||||||
|
|
||||||
|
ClassPathResource resource = new ClassPathResource("files");
|
||||||
|
List<String> pdfFileNames = Files.walk(resource.getFile().toPath())
|
||||||
|
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||||
|
.map(Path::toAbsolutePath)
|
||||||
|
.map(Path::toString)
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
for (int i = 0; i < pdfFileNames.size(); i++) {
|
||||||
|
writeJsons(Path.of(pdfFileNames.get(i)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private void writeJsons(Path filename) {
|
||||||
|
|
||||||
|
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
|
Loader.loadPDF(filename.toFile()),
|
||||||
|
new ImageServiceResponse(),
|
||||||
|
new TableServiceResponse()));
|
||||||
|
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
|
Loader.loadPDF(filename.toFile()),
|
||||||
|
new ImageServiceResponse(),
|
||||||
|
new TableServiceResponse()));
|
||||||
|
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
||||||
|
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
||||||
|
if(!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
|
||||||
|
String tmpFileNameBefore = "C:/Users/YANNIK~1/AppData/Local/Temp/before."+filename.getFileName().toString();;
|
||||||
|
System.out.println(tmpFileNameBefore);
|
||||||
|
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
|
||||||
|
PdfDraw.drawDocumentGraph(pdDocument, documentGraphBefore);
|
||||||
|
pdDocument.save(tmpFileNameBefore);
|
||||||
|
}
|
||||||
|
String tmpFileNameAfter = "C:/Users/YANNIK~1/AppData/Local/Temp/after."+filename.getFileName().toString();;
|
||||||
|
System.out.println(tmpFileNameAfter);
|
||||||
|
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
|
||||||
|
PdfDraw.drawDocumentGraph(pdDocument, documentGraphAfter);
|
||||||
|
pdDocument.save(tmpFileNameAfter);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@SneakyThrows
|
||||||
|
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2, String pdfName) {
|
||||||
|
|
||||||
|
|
||||||
|
List listStructure1 = structure1
|
||||||
|
.streamAllEntries()
|
||||||
|
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||||
|
.map(DocumentStructure.EntryData::getProperties)
|
||||||
|
.map(properties -> {
|
||||||
|
var builder = Table.builder();
|
||||||
|
PropertiesMapper.parseTableProperties(properties, builder);
|
||||||
|
return builder.build();
|
||||||
|
}).toList();
|
||||||
|
|
||||||
|
List listStructure2 = structure2
|
||||||
|
.streamAllEntries()
|
||||||
|
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||||
|
.map(DocumentStructure.EntryData::getProperties)
|
||||||
|
.map(properties -> {
|
||||||
|
var builder = Table.builder();
|
||||||
|
PropertiesMapper.parseTableProperties(properties, builder);
|
||||||
|
return builder.build();
|
||||||
|
}).toList();
|
||||||
|
|
||||||
|
|
||||||
|
for(int i = 0; i < listStructure1.size(); i++) {
|
||||||
|
Table tableNode1 = (Table) listStructure1.get(i);
|
||||||
|
Table tableNode2 = (Table) listStructure2.get(i);
|
||||||
|
if(tableNode1.getNumberOfRows() != tableNode2.getNumberOfRows() || tableNode1.getNumberOfCols() != tableNode2.getNumberOfCols()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user