* fixed bug with incorrect empty cell count by adding a threshold to cell.contains

2023-11-15 10:44:47 +01:00 · 2023-11-15 10:44:47 +01:00 · c3e69b2cdf
commit c3e69b2cdf
parent f69331e7d8
7 changed files with 270 additions and 38 deletions
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java
@ -190,14 +190,14 @@ public class LayoutParsingPipeline {
            PDRectangle cropbox = pdPage.getCropBox();
            CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber),
                    stripper.getRulings(),
-                    1,
+                   stripper.getMinCharWidth(),
-                   1);
+                   stripper.getMaxCharHeight());
            List<Rectangle> spreedSheetArea = tableExtractionService.getSpreadSheetArea(cleanRulings, layoutParsingType);
-            Map<String,Float> newValues = calculateMinCharWidthAndMaxCharHeightInsideTable(stripper,spreedSheetArea,10f,1f);
+            Map<String,Float> newValues = calculateMinCharWidthAndMaxCharHeightInsideTable(stripper,spreedSheetArea);
-            cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), newValues.get("minCharWidth"), newValues.get("maxCharHeight"));
+            cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings(), newValues.get("minCharWidth"), newValues.get("minCharHeigth"));
            ClassificationPage classificationPage = switch (layoutParsingType) {
@ -255,17 +255,17 @@ public class LayoutParsingPipeline {
     * @return Map with both values
     */
-    private Map<String, Float> calculateMinCharWidthAndMaxCharHeightInsideTable(PDFLinesTextStripper stripper, List<Rectangle> spreedSheetArea, float initialMinCharWidth, float initialMaxCharHeight) {
+    private Map<String, Float> calculateMinCharWidthAndMaxCharHeightInsideTable(PDFLinesTextStripper stripper, List<Rectangle> spreedSheetArea) {
-        float newMinCharWidth = initialMinCharWidth;
+        float newMinCharWidth = 10;
-        float newMaxCharHeight = initialMaxCharHeight;
+        float newMinCharHeight = 30;
        Map<String,Float> result = new HashMap<>();
        for(var textPositionSequence: stripper.getTextPositionSequences() ) {
            for(var redTextPosition: textPositionSequence.getTextPositions()) {
                for(var area: spreedSheetArea) {
                    if(area.contains(redTextPosition.getPosition()[0], redTextPosition.getPosition()[1], redTextPosition.getPosition()[2], redTextPosition.getPosition()[3])) {
-                        if(redTextPosition.getHeightDir() > newMaxCharHeight) {
+                        if(redTextPosition.getHeightDir() < newMinCharHeight) {
-                            newMaxCharHeight = redTextPosition.getHeightDir();
+                            newMinCharHeight = redTextPosition.getHeightDir();
                        }
                        if(redTextPosition.getWidthDirAdj() < newMinCharWidth)  {
                            newMinCharWidth = redTextPosition.getWidthDirAdj();
@ -275,7 +275,7 @@ public class LayoutParsingPipeline {
            }
        }
        result.put("minCharWidth",newMinCharWidth);
-        result.put("maxCharHeight",newMaxCharHeight);
+        result.put("minCharHeigth",newMinCharHeight);
        return result;
    }
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/graph/nodes/Table.java
@ -35,6 +35,7 @@ public class Table implements SemanticNode {
    int numberOfRows;
    int numberOfCols;
    int firstpage;
    int emptyCells;
    TextBlock textBlock;
    @Builder.Default
@ -208,7 +209,6 @@ public class Table implements SemanticNode {
        return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col));
    }
    /**
     * Streams all TableCells row-wise and filters them with header == true.
     *
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/table/TablePageBlock.java
@ -252,7 +252,7 @@ public class TablePageBlock extends AbstractPageBlock {
                if (prevY != null && prevX != null) {
                    var cell = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
-                    var intersectionCell = cells.stream().filter(c -> cell.intersects(c) && cell.overlapRatio(c) > 0.1f).findFirst();
+                    var intersectionCell = cells.stream().filter(c -> cell.intersects(c)).findFirst();
                    intersectionCell.ifPresent(value -> cell.getTextBlocks().addAll(value.getTextBlocks()));
                    if (cell.hasMinimumSize()) {
                        row.add(cell);
--- a/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java
+++ b/layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java
@ -1,6 +1,9 @@
 package com.knecon.fforesight.service.layoutparser.processor.services;
 import java.awt.geom.Point2D;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.HashMap;
@ -10,6 +13,8 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import org.apache.pdfbox.Loader;
 import org.springframework.core.io.ClassPathResource;
 import org.springframework.stereotype.Service;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
@ -66,6 +71,20 @@ public class TableExtractionService {
    };
    /** Slack, in page units, by which a rectangle may stick out of a cell and still count as contained. */
    private static final double CELL_CONTAINMENT_TOLERANCE = 2;


    /**
     * Checks whether the rectangle {@code (x, y, w, h)} lies inside the given cell, allowing it to exceed the
     * cell bounds by the default tolerance on every side.
     *
     * @param cell the cell to test against
     * @param x    left coordinate of the rectangle
     * @param y    lower coordinate of the rectangle
     * @param w    width of the rectangle
     * @param h    height of the rectangle
     * @return {@code true} if the rectangle is contained in the tolerance-expanded cell; {@code false} if it is
     * not, or if the cell is empty or the rectangle has a non-positive width or height
     */
    public boolean contains(Cell cell, double x, double y, double w, double h) {
        return contains(cell, x, y, w, h, CELL_CONTAINMENT_TOLERANCE);
    }


    /**
     * Same check as {@link #contains(Cell, double, double, double, double)}, with an explicit tolerance.
     *
     * @param cell      the cell to test against
     * @param x         left coordinate of the rectangle
     * @param y         lower coordinate of the rectangle
     * @param w         width of the rectangle
     * @param h         height of the rectangle
     * @param tolerance amount by which the cell bounds are widened on every side before testing
     * @return {@code true} if the rectangle is contained in the widened cell
     */
    public boolean contains(Cell cell, double x, double y, double w, double h, double tolerance) {
        // Degenerate inputs can never be "contained", regardless of the tolerance.
        if (cell.isEmpty() || w <= 0 || h <= 0) {
            return false;
        }
        double minX = cell.getX() - tolerance;
        double minY = cell.getY() - tolerance;
        double maxX = cell.getX() + cell.getWidth() + tolerance;
        double maxY = cell.getY() + cell.getHeight() + tolerance;
        return x >= minX
                && y >= minY
                && (x + w) <= maxX
                && (y + h) <= maxY;
    }
    /**
     * Finds tables on a page and moves textblocks into cells of the found tables.
     * Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the page rotation.
@ -84,13 +103,12 @@ public class TableExtractionService {
        List<Cell> cells = findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical(), layoutParsingType);
        List<TextPageBlock> toBeRemoved = new ArrayList<>();
        for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
            TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
            for (Cell cell : cells) {
-                if (cell.hasMinimumSize() && cell.intersects(textBlock.getPdfMinX(),
+                if (cell.hasMinimumSize() && contains(cell, textBlock.getPdfMinX(),
                        textBlock.getPdfMinY(),
                        textBlock.getPdfMaxX() - textBlock.getPdfMinX(),
                        textBlock.getPdfMaxY() - textBlock.getPdfMinY())) {
@ -104,7 +122,7 @@ public class TableExtractionService {
        cells = new ArrayList<>(new HashSet<>(cells));
        DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
-        List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells).stream().filter(r -> r.getWidth() > 0f && r.getHeight() > 0f).toList();
+        List<Rectangle> spreadsheetAreas = findSpreadsheetsFromCells(cells);
        List<TablePageBlock> tables = new ArrayList<>();
        for (Rectangle area : spreadsheetAreas) {
@ -131,9 +149,18 @@ public class TableExtractionService {
            if (position != -1) {
                page.getTextBlocks().add(position, table);
            }
            String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/page1.tables.html";
            try (FileOutputStream fileOutputStream = new FileOutputStream(Path.of(tmpFileName).toFile())) {
                fileOutputStream.write(table.getTextAsHtml().getBytes());
            }
             catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
        page.getTextBlocks().removeAll(toBeRemoved);
    }
    public List<Rectangle> getSpreadSheetArea(CleanRulings cleanRulings, LayoutParsingType layoutParsingType) {
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/graph/ViewerDocumentTest.java
@ -2,14 +2,31 @@ package com.knecon.fforesight.service.layoutparser.server.graph;
 import java.io.FileOutputStream;
 import java.nio.file.Path;
 import java.util.List;
 import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.core.io.ClassPathResource;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
 import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
 import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
 import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
 import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
 import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
 import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
 import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
 import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
 import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
 import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
 import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
@ -18,19 +35,80 @@ import lombok.SneakyThrows;
 public class ViewerDocumentTest extends BuildDocumentTest {
    @Autowired
    private SectionsBuilderService sectionsBuilderService;
    @Autowired
    private RedactManagerClassificationService redactManagerClassificationService;
    @Test
    @Disabled
    @SneakyThrows
    public void testViewerDocument() {
        String fileName = "files/2Tables.pdf";
        String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/2Tables.lines.pdf";
        LayoutGridService layoutGridService = new LayoutGridService();
        ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
        String fileName = "files/bdr/notMergedParagraphs.pdf";
        Document document = buildGraph(fileName, LayoutParsingType.TAAS);
        String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
        try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
            viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
        }
    }
    /**
     * Parses a single-page sample PDF, writes a viewer document for it into the system temp directory and prints
     * the empty-cell count stored in the table properties next to the number of empty cells found in the first
     * extracted table.
     */
    @Test
    @SneakyThrows
    public void testTableViewerDocument() {
        // Input comes from the test classpath and output goes to the platform temp directory, so the test no
        // longer depends on one developer's checkout location or Windows profile.
        // NOTE(review): assumes the sample is available under src/test/resources/files/SinglePages - confirm.
        Path pdfPath = new ClassPathResource("files/SinglePages/VV-931175_Page1.pdf").getFile().toPath();
        Path viewerFile = Path.of(System.getProperty("java.io.tmpdir"), "page1.lines.pdf");

        Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
                Loader.loadPDF(pdfPath.toFile()),
                new ImageServiceResponse(),
                new TableServiceResponse()));
        LayoutGridService layoutGridService = new LayoutGridService();
        ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
        try (var pdDocument = Loader.loadPDF(pdfPath.toFile()); var out = new FileOutputStream(viewerFile.toFile())) {
            viewerDocumentService.createViewerDocument(pdDocument, documentGraph, out, true);
        }

        // Rebuild the table nodes from the serialized properties to read the stored empty-cell count.
        DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
        List<Table> tableNodes = documentData.getDocumentStructure()
                .streamAllEntries()
                .filter(entryData -> entryData.getType().equals(NodeType.TABLE))
                .map(DocumentStructure.EntryData::getProperties)
                .map(properties -> {
                    var builder = Table.builder();
                    PropertiesMapper.parseTableProperties(properties, builder);
                    Table tableNode = builder.build();
                    return tableNode;
                })
                .toList();
        // NOTE(review): the value of the last table wins (0 if there is none); confirm whether a sum was intended.
        int emptyCellCount = 0;
        for (Table tableNode : tableNodes) {
            emptyCellCount = tableNode.getEmptyCells();
        }
        System.out.println("Empty cells " + emptyCellCount);

        ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfPath.toFile()));
        TablePageBlock table = document.getSections().stream()
                .flatMap(paragraph -> paragraph.getTables().stream())
                .findFirst()
                .orElseThrow();
        long emptyCellsFound = table.getRows().stream()
                .flatMap(List::stream)
                .filter(cell -> cell.toString().isEmpty())
                .count();
        for (List<Cell> row : table.getRows()) {
            System.out.println(row.toString());
        }
        System.out.println("Actual number of empty cells: " + emptyCellsFound);
    }
    /**
     * Parses the given PDF with the layout parsing pipeline in {@code REDACT_MANAGER} mode, using empty image and
     * table service responses, then hands the result to the classification service and the sections builder.
     *
     * @param originDocument the PDF document to parse
     * @return the parsed document after classification and section building were invoked on it
     */
    public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
        var imageResponse = new ImageServiceResponse();
        var tableResponse = new TableServiceResponse();
        ClassificationDocument parsed = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, originDocument, imageResponse, tableResponse);

        redactManagerClassificationService.classifyDocument(parsed);
        sectionsBuilderService.buildSections(parsed);
        return parsed;
    }
 }
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/segmentation/PdfSegmentationServiceTest.java
@ -29,6 +29,8 @@ import java.util.stream.Collectors;
 import static org.assertj.core.api.Assertions.assertThat;
 import javax.sound.midi.SysexMessage;
 public class PdfSegmentationServiceTest extends AbstractTest {
    @Autowired
@ -52,7 +54,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
    public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
-        ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
+        ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
                originDocument,
                new ImageServiceResponse(),
                new TableServiceResponse());
@ -166,8 +168,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
        validateTable(document, 0, 1, 1, 0, 0);
        validateTable(document, 1, 2, 2, 0, 0);
-        validateTable(document, 2, 7, 20, 0, 140);
+        validateTable(document, 2, 7, 20, 0, 0);
-        validateTable(document, 3, 8, 31, 0, 170);
+        validateTable(document, 3, 8, 31, 0, 0);
    }
@ -181,7 +183,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
        validateTableSize(document, 1);
-        validateTable(document, 0, 8, 8, 0, 2);
+        validateTable(document, 0, 8, 8, 0, 0);
        List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
                        "Author, date",
@ -191,18 +193,18 @@ public class PdfSegmentationServiceTest extends AbstractTest {
                        "Method meets analytical validation criteria",
                        "Remarks (in case validation criteria are not met)",
                        "Acceptability of the method"),
-                Arrays.asList("",
+                Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
                        "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
                        "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
                        "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
                        "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
                        "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
                        "",
                        "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
                        "Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
                Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
                        "Evans P.G. 2001 TMJ4569B, VV-323245",
                        "Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
-                        "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845 in a Trial Carried",
+                        "Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
                        "LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
                        "Y",
                        "N/A",
@ -239,8 +241,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
        validateTableSize(document, 2);
-        validateTable(document, 0, 5, 5, 0, 23);
+        validateTable(document, 0, 5, 5, 0, 0);
-        validateTable(document, 1, 11, 9, 0, 36);
+        validateTable(document, 1, 11, 9, 0, 0);
    }
@ -328,7 +330,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
        ClassificationDocument document = buildClassificationDocument(Loader.loadPDF(pdfFileResource.getFile()));
        validateTableSize(document, 1);
-        validateTable(document, 0, 10, 6, 0, 1);
+        validateTable(document, 0, 10, 6, 0, 0);
    }
@ -450,8 +452,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
        validateTableSize(document, 2);
-        validateTable(document, 0, 6, 8, 0, 2);
+        validateTable(document, 0, 6, 8, 0, 0);
-        validateTable(document, 1, 6, 8, 0, 1);
+        validateTable(document, 1, 6, 8, 0, 0);
    }
@ -465,7 +467,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
        validateTableSize(document, 1);
-        validateTable(document, 0, 9, 5, 2, 0);
+        validateTable(document, 0, 9, 5, 0, 0);
    }
@ -490,6 +492,9 @@ public class PdfSegmentationServiceTest extends AbstractTest {
        List<List<Cell>> rows = table.getRows();
        int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().equals("")).toList().size();
        for (List<Cell> row : table.getRows()) {
            row.forEach(r -> System.out.println(r.toString()));
        }
        assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect);
        assertThat(table.getColCount()).isEqualTo(colCount);
--- a/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java
+++ b/layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/services/RulingCleaningServiceTest.java
@ -1,37 +1,159 @@
 package com.knecon.fforesight.service.layoutparser.server.services;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.LinkedList;
 import java.util.List;
-import org.junit.jupiter.api.Test;
+import javax.print.Doc;
 import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.junit.jupiter.api.Test;
 import org.springframework.core.io.ClassPathResource;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.iqser.red.commons.jackson.ObjectMapperFactory;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
 import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
 import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
 import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
 import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
 import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
 import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
 import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
 import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
 import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
 import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
 import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
 import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
 import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
 import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
 import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
 import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
 import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
 import lombok.SneakyThrows;
-public class RulingCleaningServiceTest {
+public class RulingCleaningServiceTest extends BuildDocumentTest {
    @Test
 //    @Disabled
    @SneakyThrows
    public void textRulingExtraction() {
-        String fileName = "files/211.pdf";
+        String fileName = "/files/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.pdf";
-        String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
+        String lineFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/102 S-Metolachlor_RAR_02_Volume_2_2018-09-06.after.pdf";
        List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
        PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
        RulingCleaningService rulingCleaningService = new RulingCleaningService();
        List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
        writeJsons(Path.of(fileName));
        for (PageContents pageContent : pageContents) {
-            cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 20));
+            cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings(), 8, 1));
        }
        PdfDraw.drawLinesPerPage(fileName, pageContents.stream().map(PageContents::getRulings).toList(), lineFileName);
    }
    /**
     * Runs the table structure comparison of {@code writeJsons} on a single sample page.
     */
    @Test
    @SneakyThrows
    public void testTableExtractionSingle() {
        // Resolved from the test classpath instead of an absolute path inside one developer's checkout.
        // NOTE(review): assumes the sample is available under src/test/resources/files/SinglePages - confirm.
        ClassPathResource resource = new ClassPathResource("files/SinglePages/24 - SYN549522 - Acute Oral Toxicity - Rats_Page17.pdf");
        writeJsons(resource.getFile().toPath());
    }
    /**
     * Runs the table structure comparison of {@code writeJsons} on every PDF found below the {@code files}
     * test resource directory.
     */
    @Test
    @SneakyThrows
    public void testTableExtraction() {
        ClassPathResource resource = new ClassPathResource("files");
        List<Path> pdfFiles;
        // Files.walk keeps directory handles open until the stream is closed, so collect inside try-with-resources.
        try (var paths = Files.walk(resource.getFile().toPath())) {
            pdfFiles = paths.filter(path -> path.getFileName().toString().endsWith(".pdf"))
                    .map(Path::toAbsolutePath)
                    .toList();
        }
        for (Path pdfFile : pdfFiles) {
            writeJsons(pdfFile);
        }
    }
    /**
     * Parses the given PDF twice, compares the table structures of both results and, if they differ, writes a
     * drawing of each document graph as {@code before.<name>} and {@code after.<name>} into the system temp
     * directory. Despite its name, this method writes PDF files.
     *
     * @param filename path of the PDF to parse
     */
    @SneakyThrows
    private void writeJsons(Path filename) {
        // NOTE(review): "before" and "after" are produced by the identical pipeline call, so they only differ if
        // parsing is not deterministic - confirm whether one side should use a different configuration.
        Document documentGraphBefore = parseRedactManagerGraph(filename);
        Document documentGraphAfter = parseRedactManagerGraph(filename);
        DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
        DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);

        String pdfName = filename.getFileName().toString();
        if (compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), pdfName)) {
            return;
        }
        saveGraphDrawing(filename, documentGraphBefore, "before." + pdfName);
        saveGraphDrawing(filename, documentGraphAfter, "after." + pdfName);
    }


    /** Parses the PDF in REDACT_MANAGER mode with empty image and table service responses and builds its document graph. */
    @SneakyThrows
    private Document parseRedactManagerGraph(Path pdfFile) {
        return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
                Loader.loadPDF(pdfFile.toFile()),
                new ImageServiceResponse(),
                new TableServiceResponse()));
    }


    /** Draws the document graph onto a fresh copy of the PDF and saves it under the given name in the system temp directory. */
    @SneakyThrows
    private void saveGraphDrawing(Path pdfFile, Document documentGraph, String targetName) {
        // The platform temp directory replaces a path inside one developer's Windows profile.
        String target = Path.of(System.getProperty("java.io.tmpdir"), targetName).toString();
        System.out.println(target);
        try (PDDocument pdDocument = Loader.loadPDF(pdfFile.toFile())) {
            PdfDraw.drawDocumentGraph(pdDocument, documentGraph);
            pdDocument.save(target);
        }
    }
    /**
     * Compares the table nodes of two document structures position by position.
     *
     * @param structure1 first document structure
     * @param structure2 second document structure
     * @param pdfName    name of the PDF the structures belong to; currently not used by the comparison
     * @return {@code true} if both structures contain the same number of tables and every pair of tables at the
     * same position has the same number of rows and columns, {@code false} otherwise
     */
    @SneakyThrows
    private boolean compareStructures(DocumentStructure structure1, DocumentStructure structure2, String pdfName) {
        List<Table> tables1 = readTableNodes(structure1);
        List<Table> tables2 = readTableNodes(structure2);

        // A different number of tables is a structural difference as well; checking it first also keeps the
        // index-based loop below within bounds for both lists.
        if (tables1.size() != tables2.size()) {
            return false;
        }
        for (int i = 0; i < tables1.size(); i++) {
            Table first = tables1.get(i);
            Table second = tables2.get(i);
            if (first.getNumberOfRows() != second.getNumberOfRows() || first.getNumberOfCols() != second.getNumberOfCols()) {
                return false;
            }
        }
        return true;
    }


    /** Rebuilds a Table node from the properties of every TABLE entry in the structure, in stream order. */
    private List<Table> readTableNodes(DocumentStructure structure) {
        return structure.streamAllEntries()
                .filter(entryData -> entryData.getType().equals(NodeType.TABLE))
                .map(DocumentStructure.EntryData::getProperties)
                .map(properties -> {
                    var builder = Table.builder();
                    PropertiesMapper.parseTableProperties(properties, builder);
                    Table tableNode = builder.build();
                    return tableNode;
                })
                .toList();
    }
 }