* fixed bug with incorrect empty cell count by adding threshold to cell.contains
This commit is contained in:
parent
f69331e7d8
commit
c3e69b2cdf
@ -190,14 +190,14 @@ public class LayoutParsingPipeline {
|
||||
PDRectangle cropbox = pdPage.getCropBox();
|
||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
|
||||
stripper.getRulings(),
|
||||
1,
|
||||
1);
|
||||
stripper.getMinCharWidth(),
|
||||
stripper.getMaxCharHeight());
|
||||
|
||||
List<Rectangle> spreedSheetArea = tableExtractionService.getSpreadSheetArea(cleanRulings, layoutParsingType);
|
||||
|
||||
Map<String,Float> newValues = calculateMinCharWidthAndMaxCharHeightInsideTable(stripper,spreedSheetArea,10f,1f);
|
||||
Map<String,Float> newValues = calculateMinCharWidthAndMaxCharHeightInsideTable(stripper,spreedSheetArea);
|
||||
|
||||
cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), newValues.get("minCharWidth"), newValues.get("maxCharHeight"));
|
||||
cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), newValues.get("minCharWidth"), newValues.get("minCharHeigth"));
|
||||
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
@ -255,17 +255,17 @@ public class LayoutParsingPipeline {
|
||||
* @return Map with both values
|
||||
*/
|
||||
|
||||
private Map<String, Float> calculateMinCharWidthAndMaxCharHeightInsideTable(PDFLinesTextStripper stripper, List<Rectangle> spreedSheetArea, float initialMinCharWidth, float initialMaxCharHeight) {
|
||||
private Map<String, Float> calculateMinCharWidthAndMaxCharHeightInsideTable(PDFLinesTextStripper stripper, List<Rectangle> spreedSheetArea) {
|
||||
|
||||
float newMinCharWidth = initialMinCharWidth;
|
||||
float newMaxCharHeight = initialMaxCharHeight;
|
||||
float newMinCharWidth = 10;
|
||||
float newMinCharHeight = 30;
|
||||
Map<String,Float> result = new HashMap<>();
|
||||
for(var textPositionSequence: stripper.getTextPositionSequences() ) {
|
||||
for(var redTextPosition: textPositionSequence.getTextPositions()) {
|
||||
for(var area: spreedSheetArea) {
|
||||
if(area.contains(redTextPosition.getPosition()[0], redTextPosition.getPosition()[1], redTextPosition.getPosition()[2], redTextPosition.getPosition()[3])) {
|
||||
if(redTextPosition.getHeightDir() > newMaxCharHeight) {
|
||||
newMaxCharHeight = redTextPosition.getHeightDir();
|
||||
if(redTextPosition.getHeightDir() < newMinCharHeight) {
|
||||
newMinCharHeight = redTextPosition.getHeightDir();
|
||||
}
|
||||
if(redTextPosition.getWidthDirAdj() < newMinCharWidth) {
|
||||
newMinCharWidth = redTextPosition.getWidthDirAdj();
|
||||
@ -275,7 +275,7 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
}
|
||||
result.put("minCharWidth",newMinCharWidth);
|
||||
result.put("maxCharHeight",newMaxCharHeight);
|
||||
result.put("minCharHeigth",newMinCharHeight);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
@ -35,6 +35,7 @@ public class Table implements SemanticNode {
|
||||
int numberOfRows;
|
||||
int numberOfCols;
|
||||
int firstpage;
|
||||
int emptyCells;
|
||||
TextBlock textBlock;
|
||||
|
||||
@Builder.Default
|
||||
@ -208,7 +209,6 @@ public class Table implements SemanticNode {
|
||||
return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all TableCells row-wise and filters them with header == true.
|
||||
*
|
||||
|
||||
@ -252,7 +252,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
if (prevY != null && prevX != null) {
|
||||
var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
||||
|
||||
var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
|
||||
var intersectionCell = cells.stream().filter(c -> cell.intersects(c)).findFirst();
|
||||
intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
|
||||
if (cell.hasMinimumSize()) {
|
||||
row.add(cell);
|
||||
|
||||
@ -1,6 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
@ -10,6 +13,8 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
@ -66,6 +71,20 @@ public class TableExtractionService {
|
||||
};
|
||||
|
||||
|
||||
public boolean contains(Cell cell, double x, double y, double w, double h) {
|
||||
if (cell.isEmpty() || w <= 0 || h <= 0) {
|
||||
return false;
|
||||
}
|
||||
double x0 = cell.getX();
|
||||
double y0 = cell.getY();
|
||||
return (x >= x0-2 &&
|
||||
y >= y0-2 &&
|
||||
(x + w) <= x0 + cell.getWidth()+2 &&
|
||||
(y + h) <= y0 + cell.getHeight()+2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Finds tables on a page and moves textblocks into cells of the found tables.
|
||||
* Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
@ -84,13 +103,12 @@ public class TableExtractionService {
|
||||
List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType);
|
||||
|
||||
|
||||
|
||||
List<TextPageBlock> toBeRemoved = new ArrayList<>();
|
||||
|
||||
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
|
||||
for (Cell cell : cells) {
|
||||
if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(),
|
||||
if (cell.hasMinimumSize() && contains(cell, textBlock.getPdfMinX(),
|
||||
textBlock.getPdfMinY(),
|
||||
textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
|
||||
textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
|
||||
@ -104,7 +122,7 @@ public class TableExtractionService {
|
||||
cells = new ArrayList<>(new HashSet<>(cells));
|
||||
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
|
||||
|
||||
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).toList();
|
||||
List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells);
|
||||
|
||||
List<TablePageBlock> tables = new ArrayList<>();
|
||||
for (Rectangle area : spreadsheetAreas) {
|
||||
@ -131,9 +149,18 @@ public class TableExtractionService {
|
||||
if (position != -1) {
|
||||
page.getTextBlocks().add(position, table);
|
||||
}
|
||||
|
||||
String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/page1.tables.html";
|
||||
try (FileOutputStream fileOutputStream = new FileOutputStream(Path.of(tmpFileName).toFile())) {
|
||||
fileOutputStream.write(table.getTextAsHtml().getBytes());
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
page.getTextBlocks().removeAll(toBeRemoved);
|
||||
|
||||
}
|
||||
|
||||
public List<Rectangle> getSpreadSheetArea(CleanRulings cleanRulings, LayoutParsingType layoutParsingType) {
|
||||
|
||||
@ -2,14 +2,31 @@ package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
@ -18,19 +35,80 @@ import lombok.SneakyThrows;
|
||||
|
||||
public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
|
||||
@Autowired
|
||||
private SectionsBuilderService sectionsBuilderService;
|
||||
|
||||
@Autowired
|
||||
private RedactManagerClassificationService redactManagerClassificationService;
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
|
||||
String fileName = "files/2Tables.pdf";
|
||||
String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/2Tables.lines.pdf";
|
||||
LayoutGridService layoutGridService = new LayoutGridService();
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
||||
String fileName = "files/bdr/notMergedParagraphs.pdf";
|
||||
Document document = buildGraph(fileName, LayoutParsingType.TAAS);
|
||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testTableViewerDocument() {
|
||||
|
||||
String fileName = "C:\\Users\\YannikHampe\\repos\\layout-parser\\layoutparser-service\\layoutparser-service-server\\src\\test\\resources\\files\\SinglePages\\VV-931175_Page1.pdf";
|
||||
String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/page1.lines.pdf";
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(Path.of(fileName).toFile()),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
LayoutGridService layoutGridService = new LayoutGridService();
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
||||
try (var pdDocument = Loader.loadPDF(Path.of(fileName).toFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
viewerDocumentService.createViewerDocument(pdDocument, documentGraph, out, true);
|
||||
}
|
||||
//durch rows
|
||||
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
|
||||
int emptyCellCount = 0;
|
||||
List listStructure2 = documentData.getDocumentStructure()
|
||||
.streamAllEntries()
|
||||
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||
.map(DocumentStructure.EntryData::getProperties)
|
||||
.map(properties -> {
|
||||
var builder = Table.builder();
|
||||
PropertiesMapper.parseTableProperties(properties, builder);
|
||||
return builder.build();
|
||||
}).toList();
|
||||
for(int i = 0; i < listStructure2.size(); i++) {
|
||||
emptyCellCount = ((Table) listStructure2.get(i)).getEmptyCells();
|
||||
}
|
||||
|
||||
System.out.println("Empty cells "+emptyCellCount);
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(Path.of(fileName).toFile()));
|
||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
||||
int emptyCellsFoundFound = table.getRows().stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
System.out.println(row.toString());
|
||||
}
|
||||
System.out.println("Actual number of empty rows: "+emptyCellsFoundFound);
|
||||
}
|
||||
|
||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
||||
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse());
|
||||
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
|
||||
return classificationDocument;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -29,6 +29,8 @@ import java.util.stream.Collectors;
|
||||
|
||||
import static org.assertj.core.api.Assertions.assertThat;
|
||||
|
||||
import javax.sound.midi.SysexMessage;
|
||||
|
||||
public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
@Autowired
|
||||
@ -52,7 +54,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
||||
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse());
|
||||
@ -166,8 +168,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTable(document, 0, 1, 1, 0, 0);
|
||||
validateTable(document, 1, 2, 2, 0, 0);
|
||||
validateTable(document, 2, 7, 20, 0, 140);
|
||||
validateTable(document, 3, 8, 31, 0, 170);
|
||||
validateTable(document, 2, 7, 20, 0, 0);
|
||||
validateTable(document, 3, 8, 31, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -181,7 +183,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 8, 8, 0, 2);
|
||||
validateTable(document, 0, 8, 8, 0, 0);
|
||||
|
||||
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
|
||||
"Author, date",
|
||||
@ -191,18 +193,18 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
"Method meets analytical validation criteria",
|
||||
"Remarks (in case validation criteria are not met)",
|
||||
"Acceptability of the method"),
|
||||
Arrays.asList("",
|
||||
Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
|
||||
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
|
||||
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
||||
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
||||
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845 in a Trial Carried",
|
||||
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
|
||||
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
||||
"Y",
|
||||
"N/A",
|
||||
@ -239,8 +241,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
validateTable(document, 0, 5, 5, 0, 23);
|
||||
validateTable(document, 1, 11, 9, 0, 36);
|
||||
validateTable(document, 0, 5, 5, 0, 0);
|
||||
validateTable(document, 1, 11, 9, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -328,7 +330,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
|
||||
|
||||
validateTableSize(document, 1);
|
||||
validateTable(document, 0, 10, 6, 0, 1);
|
||||
validateTable(document, 0, 10, 6, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -450,8 +452,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
validateTable(document, 0, 6, 8, 0, 2);
|
||||
validateTable(document, 1, 6, 8, 0, 1);
|
||||
validateTable(document, 0, 6, 8, 0, 0);
|
||||
validateTable(document, 1, 6, 8, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -465,7 +467,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 9, 5, 2, 0);
|
||||
validateTable(document, 0, 9, 5, 0, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -490,6 +492,9 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
|
||||
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
row.forEach(r -> System.out.println(r.toString()));
|
||||
}
|
||||
assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect);
|
||||
|
||||
assertThat(table.getColCount()).isEqualTo(colCount);
|
||||
|
||||
@ -1,37 +1,159 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import javax.print.Doc;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
public class RulingCleaningServiceTest {
|
||||
public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
|
||||
@Test
|
||||
// @Disabled
|
||||
@SneakyThrows
|
||||
public void textRulingExtraction() {
|
||||
|
||||
String fileName = "files/211.pdf";
|
||||
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
|
||||
String fileName = "/files/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf";
|
||||
String lineFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.after.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
|
||||
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
|
||||
writeJsons(Path.of(fileName));
|
||||
for (PageContents pageContent : pageContents) {
|
||||
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 20));
|
||||
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 1));
|
||||
}
|
||||
PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testTableExtractionSingle() {
|
||||
String filename ="C:\\Users\\YannikHampe\\repos\\layout-parser\\layoutparser-service\\layoutparser-service-server\\src\\test\\resources\\files\\SinglePages\\24 - SYN549522 - Acute Oral Toxicity - Rats_Page17.pdf";
|
||||
writeJsons(Path.of(filename));
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testTableExtraction() {
|
||||
|
||||
|
||||
LayoutGridService layoutGridService = new LayoutGridService();
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
||||
|
||||
ClassPathResource resource = new ClassPathResource("files");
|
||||
List<String> pdfFileNames = Files.walk(resource.getFile().toPath())
|
||||
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||
.map(Path::toAbsolutePath)
|
||||
.map(Path::toString)
|
||||
.toList();
|
||||
|
||||
for (int i = 0; i < pdfFileNames.size(); i++) {
|
||||
writeJsons(Path.of(pdfFileNames.get(i)));
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void writeJsons(Path filename) {
|
||||
|
||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(filename.toFile()),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(filename.toFile()),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
||||
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
||||
if(!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
|
||||
String tmpFileNameBefore = "C:/Users/YANNIK~1/AppData/Local/Temp/before."+filename.getFileName().toString();;
|
||||
System.out.println(tmpFileNameBefore);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
|
||||
PdfDraw.drawDocumentGraph(pdDocument, documentGraphBefore);
|
||||
pdDocument.save(tmpFileNameBefore);
|
||||
}
|
||||
String tmpFileNameAfter = "C:/Users/YANNIK~1/AppData/Local/Temp/after."+filename.getFileName().toString();;
|
||||
System.out.println(tmpFileNameAfter);
|
||||
try (PDDocument pdDocument = Loader.loadPDF(filename.toFile())) {
|
||||
PdfDraw.drawDocumentGraph(pdDocument, documentGraphAfter);
|
||||
pdDocument.save(tmpFileNameAfter);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
@SneakyThrows
|
||||
private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2, String pdfName) {
|
||||
|
||||
|
||||
List listStructure1 = structure1
|
||||
.streamAllEntries()
|
||||
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||
.map(DocumentStructure.EntryData::getProperties)
|
||||
.map(properties -> {
|
||||
var builder = Table.builder();
|
||||
PropertiesMapper.parseTableProperties(properties, builder);
|
||||
return builder.build();
|
||||
}).toList();
|
||||
|
||||
List listStructure2 = structure2
|
||||
.streamAllEntries()
|
||||
.filter(entryData -> entryData.getType().equals(NodeType.TABLE))
|
||||
.map(DocumentStructure.EntryData::getProperties)
|
||||
.map(properties -> {
|
||||
var builder = Table.builder();
|
||||
PropertiesMapper.parseTableProperties(properties, builder);
|
||||
return builder.build();
|
||||
}).toList();
|
||||
|
||||
|
||||
for(int i = 0; i < listStructure1.size(); i++) {
|
||||
Table tableNode1 = (Table) listStructure1.get(i);
|
||||
Table tableNode2 = (Table) listStructure2.get(i);
|
||||
if(tableNode1.getNumberOfRows() != tableNode2.getNumberOfRows() || tableNode1.getNumberOfCols() != tableNode2.getNumberOfCols()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user