* fixed bug with incorrect empty cell count by adding threshold to cell.contains

yhampe 2023-11-15 10:44:47 +01:00
parent f69331e7d8
commit c3e69b2cdf
7 changed files with 270 additions and 38 deletions
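
Before the per-file diffs, a minimal, self-contained sketch of the idea named in the commit message: a containment check with a small tolerance, so that a text block whose bounding box pokes a point or two past a cell's rulings still counts as belonging to that cell instead of leaving the cell wrongly reported as empty. Class and method names here are hypothetical; the committed method is the contains(...) added to TableExtractionService below.

import java.awt.geom.Rectangle2D;

public class TolerantContainsSketch {

    // Tolerance in PDF points; the committed method below hard-codes 2.
    private static final double TOLERANCE = 2.0;

    // True if the box (x, y, w, h) lies inside the cell, allowing a small overhang on every side.
    static boolean contains(Rectangle2D cell, double x, double y, double w, double h) {
        if (cell.isEmpty() || w <= 0 || h <= 0) {
            return false;
        }
        return x >= cell.getX() - TOLERANCE
                && y >= cell.getY() - TOLERANCE
                && x + w <= cell.getX() + cell.getWidth() + TOLERANCE
                && y + h <= cell.getY() + cell.getHeight() + TOLERANCE;
    }

    public static void main(String[] args) {
        Rectangle2D cell = new Rectangle2D.Double(100, 100, 50, 20);
        // A text block that overshoots the right cell border by 1pt: strict containment rejects it,
        // the tolerant check accepts it, so the cell is no longer miscounted as empty.
        System.out.println(cell.contains(new Rectangle2D.Double(102, 102, 49, 16))); // false
        System.out.println(contains(cell, 102, 102, 49, 16));                        // true
    }
}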

View File

@ -190,14 +190,14 @@ public class LayoutParsingPipeline {
PDRectangle cropbox = pdPage.getCropBox();
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
stripper.getRulings(),
1,
1);
stripper.getMinCharWidth(),
stripper.getMaxCharHeight());
List<Rectangle> spreedSheetArea = tableExtractionService.getSpreadSheetArea(cleanRulings, layoutParsingType);
Map<String,Float> newValues = calculateMinCharWidthAndMaxCharHeightInsideTable(stripper,spreedSheetArea,10f,1f);
Map<String,Float> newValues = calculateMinCharWidthAndMaxCharHeightInsideTable(stripper,spreedSheetArea);
cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), newValues.get("minCharWidth"), newValues.get("maxCharHeight"));
cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), newValues.get("minCharWidth"), newValues.get("minCharHeigth"));
ClassificationPage classificationPage = switch (layoutParsingType) {
@ -255,17 +255,17 @@ public class LayoutParsingPipeline {
* @return Map with both values
*/
private Map<String, Float> calculateMinCharWidthAndMaxCharHeightInsideTable(PDFLinesTextStripper stripper, List<Rectangle> spreedSheetArea, float initialMinCharWidth, float initialMaxCharHeight) {
private Map<String, Float> calculateMinCharWidthAndMaxCharHeightInsideTable(PDFLinesTextStripper stripper, List<Rectangle> spreedSheetArea) {
float newMinCharWidth = initialMinCharWidth;
float newMaxCharHeight = initialMaxCharHeight;
float newMinCharWidth = 10;
float newMinCharHeight = 30;
Map<String,Float> result = new HashMap<>();
for(var textPositionSequence: stripper.getTextPositionSequences() ) {
for(var redTextPosition: textPositionSequence.getTextPositions()) {
for(var area: spreedSheetArea) {
if(area.contains(redTextPosition.getPosition()[0], redTextPosition.getPosition()[1], redTextPosition.getPosition()[2], redTextPosition.getPosition()[3])) {
if(redTextPosition.getHeightDir() > newMaxCharHeight) {
newMaxCharHeight = redTextPosition.getHeightDir();
if(redTextPosition.getHeightDir() < newMinCharHeight) {
newMinCharHeight = redTextPosition.getHeightDir();
}
if(redTextPosition.getWidthDirAdj() < newMinCharWidth) {
newMinCharWidth = redTextPosition.getWidthDirAdj();
@ -275,7 +275,7 @@ public class LayoutParsingPipeline {
}
}
result.put("minCharWidth",newMinCharWidth);
result.put("maxCharHeight",newMaxCharHeight);
result.put("minCharHeigth",newMinCharHeight);
return result;
}
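
Because the removed and added lines in the hunk above carry no +/- markers, here is a hedged reconstruction of how the updated flow reads after this commit: the first ruling-cleaning pass now uses the stripper's own character metrics instead of the constants 1, 1; the spreadsheet areas found from that pass are used to measure the smallest glyph width and height occurring inside tables; and the rulings are cleaned a second time with those measured values. Service, accessor, and key names follow the hunk above; this is a reading aid, not the committed code.

// Reading aid (assumed reconstruction, not the committed code): two-pass ruling cleaning.
private CleanRulings cleanRulingsForPage(List<Cell> cellsOnPage,
                                         PDFLinesTextStripper stripper,
                                         LayoutParsingType layoutParsingType) {
    // Pass 1: clean with the stripper's document-wide character metrics.
    CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(cellsOnPage,
            stripper.getRulings(), stripper.getMinCharWidth(), stripper.getMaxCharHeight());
    List<Rectangle> spreadSheetAreas = tableExtractionService.getSpreadSheetArea(cleanRulings, layoutParsingType);

    // Measure the smallest glyph width and height that actually occur inside the detected areas.
    Map<String, Float> measured = calculateMinCharWidthAndMaxCharHeightInsideTable(stripper, spreadSheetAreas);

    // Pass 2: re-clean with the measured, table-local thresholds.
    return rulingCleaningService.getCleanRulings(cellsOnPage, stripper.getRulings(),
            measured.get("minCharWidth"), measured.get("minCharHeight"));
}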

View File

@ -35,6 +35,7 @@ public class Table implements SemanticNode {
int numberOfRows;
int numberOfCols;
int firstpage;
int emptyCells;
TextBlock textBlock;
@Builder.Default
@ -208,7 +209,6 @@ public class Table implements SemanticNode {
return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col));
}
/**
* Streams all TableCells row-wise and filters them with header == true.
*
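
The emptyCells field added above carries the count that the tests further down verify. A minimal sketch of how such a count can be derived from a table's rows, using the same count-cells-whose-text-is-empty pattern as those tests (class and method names here are hypothetical):

import java.util.List;

public class EmptyCellCountSketch {

    // Counts cells whose textual content is empty, e.g. to populate Table#emptyCells.
    static long countEmptyCells(List<List<String>> rows) {
        return rows.stream()
                .flatMap(List::stream)
                .filter(String::isEmpty)   // the tests compare Cell#toString() against ""
                .count();
    }

    public static void main(String[] args) {
        List<List<String>> rows = List.of(
                List.of("Header A", "Header B"),
                List.of("value", ""));     // one genuinely empty cell
        System.out.println(countEmptyCells(rows)); // prints 1
    }
}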

View File

@ -252,7 +252,7 @@ public class TablePageBlock extends AbstractPageBlock {
if (prevY != null && prevX != null) {
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
var intersectionCell = cells.stream().filter(c -> cell.intersects(c)).findFirst();
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
if (cell.hasMinimumSize()) {
row.add(cell);

View File

@ -1,6 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Point2D;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
@ -10,6 +13,8 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.pdfbox.Loader;
import org.springframework.core.io.ClassPathResource;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
@ -66,6 +71,20 @@ public class TableExtractionService {
};
public boolean contains(Cell cell, double x, double y, double w, double h) {
if (cell.isEmpty() || w <= 0 || h <= 0) {
return false;
}
double x0 = cell.getX();
double y0 = cell.getY();
return (x >= x0-2 &&
y >= y0-2 &&
(x + w) <= x0 + cell.getWidth()+2 &&
(y + h) <= y0 + cell.getHeight()+2);
}
/**
* Finds tables on a page and moves textblocks into cells of the found tables.
* Note: This algorithm uses the PDF coordinate system, where {0,0} is rotated with the page rotation.
@ -84,13 +103,12 @@ public class TableExtractionService {
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType);
List<TextPageBlock> toBeRemoved = new ArrayList<>();
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
for (Cell cell : cells) {
if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(),
if (cell.hasMinimumSize() && contains(cell, textBlock.getPdfMinX(),
textBlock.getPdfMinY(),
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
@ -104,7 +122,7 @@ public class TableExtractionService {
cells = new ArrayList<>(new HashSet<>(cells));
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).toList();
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells);
List<TablePageBlock> tables = new ArrayList<>();
for (Rectangle area : spreadsheetAreas) {
@ -131,9 +149,18 @@ public class TableExtractionService {
if (position != -1) {
page.getTextBlocks().add(position, table);
}
String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/page1.tables.html";
try (FileOutputStream fileOutputStream = new FileOutputStream(Path.of(tmpFileName).toFile())) {
fileOutputStream.write(table.getTextAsHtml().getBytes());
}
catch (IOException e) {
throw new RuntimeException(e);
}
}
page.getTextBlocks().removeAll(toBeRemoved);
}
public List<Rectangle> getSpreadSheetArea(CleanRulings cleanRulings, LayoutParsingType layoutParsingType) {

View File

@ -2,14 +2,31 @@ package com.knecon.fforesight.service.layoutparser.server.graph;
import java.io.FileOutputStream;
import java.nio.file.Path;
import java.util.List;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
@ -18,19 +35,80 @@ import lombok.SneakyThrows;
public class ViewerDocumentTest extends BuildDocumentTest {
@Autowired
private SectionsBuilderService sectionsBuilderService;
@Autowired
private RedactManagerClassificationService redactManagerClassificationService;
@Test
@Disabled
@SneakyThrows
public void testViewerDocument() {
String fileName = "files/2Tables.pdf";
String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/2Tables.lines.pdf";
LayoutGridService layoutGridService = new LayoutGridService();
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
String fileName = "files/bdr/notMergedParagraphs.pdf";
Document document = buildGraph(fileName, LayoutParsingType.TAAS);
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
}
}
@Test
@SneakyThrows
public void testTableViewerDocument() {
String fileName = "C:\\Users\\YannikHampe\\repos\\layout-parser\\layoutparser-service\\layoutparser-service-server\\src\\test\\resources\\files\\SinglePages\\VV-931175_Page1.pdf";
String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/page1.lines.pdf";
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Loader.loadPDF(Path.of(fileName).toFile()),
new ImageServiceResponse(),
new TableServiceResponse()));
LayoutGridService layoutGridService = new LayoutGridService();
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
try (var pdDocument = Loader.loadPDF(Path.of(fileName).toFile()); var out = new FileOutputStream(tmpFileName)) {
viewerDocumentService.createViewerDocument(pdDocument, documentGraph, out, true);
}
// iterate through the rows
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
int emptyCellCount = 0;
List listStructure2 = documentData.getDocumentStructure()
.streamAllEntries()
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
.map(DocumentStructure.EntryData::getProperties)
.map(properties -> {
var builder = Table.builder();
PropertiesMapper.parseTableProperties(properties, builder);
return builder.build();
}).toList();
for(int i = 0; i < listStructure2.size(); i++) {
emptyCellCount = ((Table) listStructure2.get(i)).getEmptyCells();
}
System.out.println("Empty cells "+emptyCellCount);
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(Path.of(fileName).toFile()));
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
int emptyCellsFoundFound = table.getRows().stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
for (List<Cell> row : table.getRows()) {
System.out.println(row.toString());
}
System.out.println("Actual number of empty rows: "+emptyCellsFoundFound);
}
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
originDocument,
new ImageServiceResponse(),
new TableServiceResponse());
redactManagerClassificationService.classifyDocument(classificationDocument);
sectionsBuilderService.buildSections(classificationDocument);
return classificationDocument;
}
}

View File

@ -29,6 +29,8 @@ import java.util.stream.Collectors;
import static org.assertj.core.api.Assertions.assertThat;
import javax.sound.midi.SysexMessage;
public class PdfSegmentationServiceTest extends AbstractTest {
@Autowired
@ -52,7 +54,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
originDocument,
new ImageServiceResponse(),
new TableServiceResponse());
@ -166,8 +168,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTable(document, 0, 1, 1, 0, 0);
validateTable(document, 1, 2, 2, 0, 0);
validateTable(document, 2, 7, 20, 0, 140);
validateTable(document, 3, 8, 31, 0, 170);
validateTable(document, 2, 7, 20, 0, 0);
validateTable(document, 3, 8, 31, 0, 0);
}
@ -181,7 +183,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 1);
validateTable(document, 0, 8, 8, 0, 2);
validateTable(document, 0, 8, 8, 0, 0);
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
"Author, date",
@ -191,18 +193,18 @@ public class PdfSegmentationServiceTest extends AbstractTest {
"Method meets analytical validation criteria",
"Remarks (in case validation criteria are not met)",
"Acceptability of the method"),
Arrays.asList("",
Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
"Evans P.G. 2001 TMJ4569B, VV-323245",
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845 in a Trial Carried",
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
"Y",
"N/A",
@ -239,8 +241,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 2);
validateTable(document, 0, 5, 5, 0, 23);
validateTable(document, 1, 11, 9, 0, 36);
validateTable(document, 0, 5, 5, 0, 0);
validateTable(document, 1, 11, 9, 0, 0);
}
@ -328,7 +330,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
validateTableSize(document, 1);
validateTable(document, 0, 10, 6, 0, 1);
validateTable(document, 0, 10, 6, 0, 0);
}
@ -450,8 +452,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 2);
validateTable(document, 0, 6, 8, 0, 2);
validateTable(document, 1, 6, 8, 0, 1);
validateTable(document, 0, 6, 8, 0, 0);
validateTable(document, 1, 6, 8, 0, 0);
}
@ -465,7 +467,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 1);
validateTable(document, 0, 9, 5, 2, 0);
validateTable(document, 0, 9, 5, 0, 0);
}
@ -490,6 +492,9 @@ public class PdfSegmentationServiceTest extends AbstractTest {
List<List<Cell>> rows = table.getRows();
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
for (List<Cell> row : table.getRows()) {
row.forEach(r -> System.out.println(r.toString()));
}
assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect);
assertThat(table.getColCount()).isEqualTo(colCount);

View File

@ -1,37 +1,159 @@
package com.knecon.fforesight.service.layoutparser.server.services;
import java.io.File;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import org.junit.jupiter.api.Test;
import javax.print.Doc;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows;
public class RulingCleaningServiceTest {
public class RulingCleaningServiceTest extends BuildDocumentTest {
@Test
// @Disabled
@SneakyThrows
public void textRulingExtraction() {
String fileName = "files/211.pdf";
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
String fileName = "/files/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf";
String lineFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.after.pdf";
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
RulingCleaningService rulingCleaningService = new RulingCleaningService();
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
writeJsons(Path.of(fileName));
for (PageContents pageContent : pageContents) {
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 20));
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 1));
}
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
}
@Test
@SneakyThrows
public void testTableExtractionSingle() {
String filename ="C:\\Users\\YannikHampe\\repos\\layout-parser\\layoutparser-service\\layoutparser-service-server\\src\\test\\resources\\files\\SinglePages\\24 - SYN549522 - Acute Oral Toxicity - Rats_Page17.pdf";
writeJsons(Path.of(filename));
}
@Test
@SneakyThrows
public void testTableExtraction() {
LayoutGridService layoutGridService = new LayoutGridService();
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
ClassPathResource resource = new ClassPathResource("files");
List<String> pdfFileNames = Files.walk(resource.getFile().toPath())
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
.map(Path::toAbsolutePath)
.map(Path::toString)
.toList();
for (int i = 0; i < pdfFileNames.size(); i++) {
writeJsons(Path.of(pdfFileNames.get(i)));
}
}
@SneakyThrows
private void writeJsons(Path filename) {
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Loader.loadPDF(filename.toFile()),
new ImageServiceResponse(),
new TableServiceResponse()));
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Loader.loadPDF(filename.toFile()),
new ImageServiceResponse(),
new TableServiceResponse()));
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
if(!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
String tmpFileNameBefore = "C:/Users/YANNIK~1/AppData/Local/Temp/before."+filename.getFileName().toString();;
System.out.println(tmpFileNameBefore);
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
PdfDraw.drawDocumentGraph(pdDocument, documentGraphBefore);
pdDocument.save(tmpFileNameBefore);
}
String tmpFileNameAfter = "C:/Users/YANNIK~1/AppData/Local/Temp/after."+filename.getFileName().toString();;
System.out.println(tmpFileNameAfter);
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
PdfDraw.drawDocumentGraph(pdDocument, documentGraphAfter);
pdDocument.save(tmpFileNameAfter);
}
}
}
@SneakyThrows
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2, String pdfName) {
List listStructure1 = structure1
.streamAllEntries()
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
.map(DocumentStructure.EntryData::getProperties)
.map(properties -> {
var builder = Table.builder();
PropertiesMapper.parseTableProperties(properties, builder);
return builder.build();
}).toList();
List listStructure2 = structure2
.streamAllEntries()
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
.map(DocumentStructure.EntryData::getProperties)
.map(properties -> {
var builder = Table.builder();
PropertiesMapper.parseTableProperties(properties, builder);
return builder.build();
}).toList();
for(int i = 0; i < listStructure1.size(); i++) {
Table tableNode1 = (Table) listStructure1.get(i);
Table tableNode2 = (Table) listStructure2.get(i);
if(tableNode1.getNumberOfRows() != tableNode2.getNumberOfRows() || tableNode1.getNumberOfCols() != tableNode2.getNumberOfCols()) {
return false;
}
}
return true;
}
}