diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java index 815d1a2f..10df1632 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/model/Section.java @@ -713,19 +713,22 @@ public class Section { for (SectionArea sectionArea : sectionAreas) { RedRectangle2D position = RedRectangle2D.builder() - .height(sectionArea.getHeight()) - .width(sectionArea.getWidth()) - .x(sectionArea.getTopLeft().getX()) - .y(sectionArea.getTopLeft().getY()) + .height(sectionArea.getHeight() + 4) + .width(sectionArea.getWidth() + 4) + .x(sectionArea.getTopLeft().getX() - 2) + .y(sectionArea.getTopLeft().getY() - 2) .build(); + log.debug("SectionArea: {}", sectionArea); + log.debug("Position {}", position.toString()); + Image image = Image.builder() .page(sectionArea.getPage()) .position(position) .redaction(true) .hasTransparency(false) .sectionNumber(sectionNumber) - .section(sectionArea.getHeader()) + .section(headline) .matchedRule(ruleNumber) .legalBasis(legalBasis) .redactionReason(reason) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java index 4d292591..eeaec117 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/redaction/service/EntityRedactionService.java @@ -127,6 +127,7 @@ public class EntityRedactionService { })); } + log.debug("Section {}, Images: {}", reanalysisSection.getSectionNumber(), reanalysisSection.getImages()); sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() .isLocal(false) diff --git a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java index 59cf79d7..f3401ec4 100644 --- a/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java +++ b/redaction-service-v1/redaction-service-server-v1/src/main/java/com/iqser/red/service/redaction/v1/server/segmentation/SectionsBuilderService.java @@ -1,16 +1,31 @@ package com.iqser.red.service.redaction.v1.server.segmentation; -import com.iqser.red.service.redaction.v1.server.classification.model.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.commons.collections4.CollectionUtils; +import org.springframework.stereotype.Service; + +import com.iqser.red.service.redaction.v1.server.classification.model.Document; +import com.iqser.red.service.redaction.v1.server.classification.model.Footer; +import com.iqser.red.service.redaction.v1.server.classification.model.Header; +import com.iqser.red.service.redaction.v1.server.classification.model.Page; +import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph; +import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock; +import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText; import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; -import org.apache.commons.collections4.CollectionUtils; -import org.springframework.stereotype.Service; -import java.util.*; -import java.util.stream.Collectors; +import lombok.extern.slf4j.Slf4j; +@Slf4j @Service public class SectionsBuilderService { @@ -53,8 +68,7 @@ public class SectionsBuilderService { continue; } - if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification() - .startsWith("H ") || !document.isHeadlines()) { + if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification().startsWith("H ") || !document.isHeadlines()) { Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline); chunkBlock.setHeadline(lastHeadline); if (document.isHeadlines()) { @@ -100,17 +114,19 @@ public class SectionsBuilderService { public void addImagesToSections(Document document) { - Map> paragraphMap = new HashMap<>(); + Map> paragraphMap = new HashMap<>(); for (Paragraph paragraph : document.getParagraphs()) { for (AbstractTextContainer container : paragraph.getPageBlocks()) { - paragraphMap.computeIfAbsent(container.getPage(), x -> new TreeSet<>()).add(paragraph); + + paragraphMap.computeIfAbsent(container.getPage(), c -> new ArrayList<>()).add(paragraph); + } } if (paragraphMap.isEmpty()) { Paragraph paragraph = new Paragraph(); document.getParagraphs().add(paragraph); - paragraphMap.computeIfAbsent(1, x -> new TreeSet<>()).add(paragraph); + paragraphMap.computeIfAbsent(1, x -> new ArrayList<>()).add(paragraph); } // first page is always a paragraph, else we can't process pages 1..N, @@ -118,12 +134,12 @@ public class SectionsBuilderService { if (paragraphMap.get(1) == null) { Paragraph paragraph = new Paragraph(); document.getParagraphs().add(paragraph); - paragraphMap.computeIfAbsent(1, x -> new TreeSet<>()).add(paragraph); + paragraphMap.computeIfAbsent(1, x -> new ArrayList<>()).add(paragraph); } for (Page page : document.getPages()) { for (PdfImage image : page.getImages()) { - SortedSet paragraphsOnPage = paragraphMap.get(page.getPageNumber()); + List paragraphsOnPage = paragraphMap.get(page.getPageNumber()); if (paragraphsOnPage == null) { int i = page.getPageNumber(); while (paragraphsOnPage == null) { @@ -131,27 +147,63 @@ public class SectionsBuilderService { i--; } } - - Float perviousEnd = 0f; for (Paragraph paragraph : paragraphsOnPage) { - Float currentEnd = 0f; + Float xMin = null; + Float yMin = null; + Float xMax = null; + Float yMax = null; + for (AbstractTextContainer abs : paragraph.getPageBlocks()) { if (abs.getPage() != page.getPageNumber()) { continue; } - if (abs.getMaxY() > currentEnd) { - currentEnd = abs.getMaxY(); + + if (abs.getMinX() < abs.getMaxX()) { + if (xMin == null || abs.getMinX() < xMin) { + xMin = abs.getMinX(); + } + if (xMax == null || abs.getMaxX() > xMax) { + xMax = abs.getMaxX(); + } + } else { + if (xMin == null || abs.getMaxX() < xMin) { + xMin = abs.getMaxX(); + } + if (xMax == null || abs.getMinX() > xMax) { + xMax = abs.getMinX(); + } } + + if (abs.getMinY() < abs.getMaxY()) { + if (yMin == null || abs.getMinY() < yMin) { + yMin = abs.getMinY(); + } + if (yMax == null || abs.getMaxY() > yMax) { + yMax = abs.getMaxY(); + } + } else { + if (yMin == null || abs.getMaxY() < yMin) { + yMin = abs.getMaxY(); + } + if (yMax == null || abs.getMinY() > yMax) { + yMax = abs.getMinY(); + } + } + } - if (image.getPosition().getY() >= perviousEnd && image.getPosition().getY() <= currentEnd) { + log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY()); + log.debug("Paragraph position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax); + + if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition() + .getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) { paragraph.getImages().add(image); image.setAppendedToParagraph(true); } - perviousEnd = currentEnd; } if (!image.isAppendedToParagraph()) { - paragraphsOnPage.first().getImages().add(image); + log.debug("Image uses first paragraph"); + paragraphsOnPage.get(0).getImages().add(image); image.setAppendedToParagraph(true); } } @@ -166,9 +218,7 @@ public class SectionsBuilderService { List previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable); List tableNonHeaderRow = getRowWithNonHeaderCells(currentTable); // Allow merging of tables if header row is separated from first logical non-header row - if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows() - .get(0) - .size() == tableNonHeaderRow.size()) { + if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) { previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> { Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]); fakeCell.setHeaderCells(Collections.singletonList(cell)); @@ -178,8 +228,7 @@ public class SectionsBuilderService { if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) { for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table List row = currentTable.getRows().get(i); - if (row.size() == tableNonHeaderRow.size() && row.stream() - .allMatch(cell -> cell.getHeaderCells().isEmpty())) { + if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) { for (int j = 0; j < row.size(); j++) { row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells()); } @@ -229,24 +278,20 @@ public class SectionsBuilderService { TextBlock wordBlock = (TextBlock) container; if (textBlock == null) { - textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock - .getSequences(), wordBlock.getRotation()); + textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation()); textBlock.setPage(wordBlock.getPage()); } else if (splitByTable) { - textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock - .getSequences(), wordBlock.getRotation()); + textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation()); textBlock.setPage(wordBlock.getPage()); alreadyAdded = false; } else if (pageBefore != -1 && wordBlock.getPage() != pageBefore) { textBlock.setPage(pageBefore); paragraph.getPageBlocks().add(textBlock); - textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock - .getSequences(), wordBlock.getRotation()); + textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation()); textBlock.setPage(wordBlock.getPage()); } else { TextBlock spatialEntity = textBlock.union(wordBlock); - textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity - .getHeight()); + textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight()); } pageBefore = wordBlock.getPage(); splitByTable = false; @@ -268,11 +313,7 @@ public class SectionsBuilderService { private boolean hasInvalidHeaderInformation(Table table) { - return table.getRows() - .stream() - .flatMap(row -> row.stream().filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells()))) - .findAny() - .isEmpty(); + return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells()))).findAny().isEmpty(); }