RED-3816: Bugfix with adding images to sections

This commit is contained in:
Philipp Schramm 2022-05-30 13:52:57 +02:00
parent 77584c9a5a
commit c1192ceefd
3 changed files with 88 additions and 43 deletions

View File

@ -713,19 +713,22 @@ public class Section {
for (SectionArea sectionArea : sectionAreas) { for (SectionArea sectionArea : sectionAreas) {
RedRectangle2D position = RedRectangle2D.builder() RedRectangle2D position = RedRectangle2D.builder()
.height(sectionArea.getHeight()) .height(sectionArea.getHeight() + 4)
.width(sectionArea.getWidth()) .width(sectionArea.getWidth() + 4)
.x(sectionArea.getTopLeft().getX()) .x(sectionArea.getTopLeft().getX() - 2)
.y(sectionArea.getTopLeft().getY()) .y(sectionArea.getTopLeft().getY() - 2)
.build(); .build();
log.debug("SectionArea: {}", sectionArea);
log.debug("Position {}", position.toString());
Image image = Image.builder() Image image = Image.builder()
.page(sectionArea.getPage()) .page(sectionArea.getPage())
.position(position) .position(position)
.redaction(true) .redaction(true)
.hasTransparency(false) .hasTransparency(false)
.sectionNumber(sectionNumber) .sectionNumber(sectionNumber)
.section(sectionArea.getHeader()) .section(headline)
.matchedRule(ruleNumber) .matchedRule(ruleNumber)
.legalBasis(legalBasis) .legalBasis(legalBasis)
.redactionReason(reason) .redactionReason(reason)

View File

@ -127,6 +127,7 @@ public class EntityRedactionService {
})); }));
} }
log.debug("Section {}, Images: {}", reanalysisSection.getSectionNumber(), reanalysisSection.getImages());
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder() sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(false) .isLocal(false)

View File

@ -1,16 +1,31 @@
package com.iqser.red.service.redaction.v1.server.segmentation; package com.iqser.red.service.redaction.v1.server.segmentation;
import com.iqser.red.service.redaction.v1.server.classification.model.*; import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage; import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer; import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table; import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import java.util.*; import lombok.extern.slf4j.Slf4j;
import java.util.stream.Collectors;
@Slf4j
@Service @Service
public class SectionsBuilderService { public class SectionsBuilderService {
@ -53,8 +68,7 @@ public class SectionsBuilderService {
continue; continue;
} }
if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification() if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification().startsWith("H ") || !document.isHeadlines()) {
.startsWith("H ") || !document.isHeadlines()) {
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline); Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
chunkBlock.setHeadline(lastHeadline); chunkBlock.setHeadline(lastHeadline);
if (document.isHeadlines()) { if (document.isHeadlines()) {
@ -100,17 +114,19 @@ public class SectionsBuilderService {
public void addImagesToSections(Document document) { public void addImagesToSections(Document document) {
Map<Integer, SortedSet<Paragraph>> paragraphMap = new HashMap<>(); Map<Integer, List<Paragraph>> paragraphMap = new HashMap<>();
for (Paragraph paragraph : document.getParagraphs()) { for (Paragraph paragraph : document.getParagraphs()) {
for (AbstractTextContainer container : paragraph.getPageBlocks()) { for (AbstractTextContainer container : paragraph.getPageBlocks()) {
paragraphMap.computeIfAbsent(container.getPage(), x -> new TreeSet<>()).add(paragraph);
paragraphMap.computeIfAbsent(container.getPage(), c -> new ArrayList<>()).add(paragraph);
} }
} }
if (paragraphMap.isEmpty()) { if (paragraphMap.isEmpty()) {
Paragraph paragraph = new Paragraph(); Paragraph paragraph = new Paragraph();
document.getParagraphs().add(paragraph); document.getParagraphs().add(paragraph);
paragraphMap.computeIfAbsent(1, x -> new TreeSet<>()).add(paragraph); paragraphMap.computeIfAbsent(1, x -> new ArrayList<>()).add(paragraph);
} }
// first page is always a paragraph, else we can't process pages 1..N, // first page is always a paragraph, else we can't process pages 1..N,
@ -118,12 +134,12 @@ public class SectionsBuilderService {
if (paragraphMap.get(1) == null) { if (paragraphMap.get(1) == null) {
Paragraph paragraph = new Paragraph(); Paragraph paragraph = new Paragraph();
document.getParagraphs().add(paragraph); document.getParagraphs().add(paragraph);
paragraphMap.computeIfAbsent(1, x -> new TreeSet<>()).add(paragraph); paragraphMap.computeIfAbsent(1, x -> new ArrayList<>()).add(paragraph);
} }
for (Page page : document.getPages()) { for (Page page : document.getPages()) {
for (PdfImage image : page.getImages()) { for (PdfImage image : page.getImages()) {
SortedSet<Paragraph> paragraphsOnPage = paragraphMap.get(page.getPageNumber()); List<Paragraph> paragraphsOnPage = paragraphMap.get(page.getPageNumber());
if (paragraphsOnPage == null) { if (paragraphsOnPage == null) {
int i = page.getPageNumber(); int i = page.getPageNumber();
while (paragraphsOnPage == null) { while (paragraphsOnPage == null) {
@ -131,27 +147,63 @@ public class SectionsBuilderService {
i--; i--;
} }
} }
Float perviousEnd = 0f;
for (Paragraph paragraph : paragraphsOnPage) { for (Paragraph paragraph : paragraphsOnPage) {
Float currentEnd = 0f; Float xMin = null;
Float yMin = null;
Float xMax = null;
Float yMax = null;
for (AbstractTextContainer abs : paragraph.getPageBlocks()) { for (AbstractTextContainer abs : paragraph.getPageBlocks()) {
if (abs.getPage() != page.getPageNumber()) { if (abs.getPage() != page.getPageNumber()) {
continue; continue;
} }
if (abs.getMaxY() > currentEnd) {
currentEnd = abs.getMaxY(); if (abs.getMinX() < abs.getMaxX()) {
if (xMin == null || abs.getMinX() < xMin) {
xMin = abs.getMinX();
}
if (xMax == null || abs.getMaxX() > xMax) {
xMax = abs.getMaxX();
}
} else {
if (xMin == null || abs.getMaxX() < xMin) {
xMin = abs.getMaxX();
}
if (xMax == null || abs.getMinX() > xMax) {
xMax = abs.getMinX();
} }
} }
if (image.getPosition().getY() >= perviousEnd && image.getPosition().getY() <= currentEnd) { if (abs.getMinY() < abs.getMaxY()) {
if (yMin == null || abs.getMinY() < yMin) {
yMin = abs.getMinY();
}
if (yMax == null || abs.getMaxY() > yMax) {
yMax = abs.getMaxY();
}
} else {
if (yMin == null || abs.getMaxY() < yMin) {
yMin = abs.getMaxY();
}
if (yMax == null || abs.getMinY() > yMax) {
yMax = abs.getMinY();
}
}
}
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
log.debug("Paragraph position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition()
.getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
paragraph.getImages().add(image); paragraph.getImages().add(image);
image.setAppendedToParagraph(true); image.setAppendedToParagraph(true);
} }
perviousEnd = currentEnd;
} }
if (!image.isAppendedToParagraph()) { if (!image.isAppendedToParagraph()) {
paragraphsOnPage.first().getImages().add(image); log.debug("Image uses first paragraph");
paragraphsOnPage.get(0).getImages().add(image);
image.setAppendedToParagraph(true); image.setAppendedToParagraph(true);
} }
} }
@ -166,9 +218,7 @@ public class SectionsBuilderService {
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable); List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable); List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
// Allow merging of tables if header row is separated from first logical non-header row // Allow merging of tables if header row is separated from first logical non-header row
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows() if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
.get(0)
.size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> { previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> {
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]); Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
fakeCell.setHeaderCells(Collections.singletonList(cell)); fakeCell.setHeaderCells(Collections.singletonList(cell));
@ -178,8 +228,7 @@ public class SectionsBuilderService {
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) { if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = currentTable.getRows().get(i); List<Cell> row = currentTable.getRows().get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream() if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
for (int j = 0; j < row.size(); j++) { for (int j = 0; j < row.size(); j++) {
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells()); row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
} }
@ -229,24 +278,20 @@ public class SectionsBuilderService {
TextBlock wordBlock = (TextBlock) container; TextBlock wordBlock = (TextBlock) container;
if (textBlock == null) { if (textBlock == null) {
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
.getSequences(), wordBlock.getRotation());
textBlock.setPage(wordBlock.getPage()); textBlock.setPage(wordBlock.getPage());
} else if (splitByTable) { } else if (splitByTable) {
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
.getSequences(), wordBlock.getRotation());
textBlock.setPage(wordBlock.getPage()); textBlock.setPage(wordBlock.getPage());
alreadyAdded = false; alreadyAdded = false;
} else if (pageBefore != -1 && wordBlock.getPage() != pageBefore) { } else if (pageBefore != -1 && wordBlock.getPage() != pageBefore) {
textBlock.setPage(pageBefore); textBlock.setPage(pageBefore);
paragraph.getPageBlocks().add(textBlock); paragraph.getPageBlocks().add(textBlock);
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
.getSequences(), wordBlock.getRotation());
textBlock.setPage(wordBlock.getPage()); textBlock.setPage(wordBlock.getPage());
} else { } else {
TextBlock spatialEntity = textBlock.union(wordBlock); TextBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
.getHeight());
} }
pageBefore = wordBlock.getPage(); pageBefore = wordBlock.getPage();
splitByTable = false; splitByTable = false;
@ -268,11 +313,7 @@ public class SectionsBuilderService {
private boolean hasInvalidHeaderInformation(Table table) { private boolean hasInvalidHeaderInformation(Table table) {
return table.getRows() return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells()))).findAny().isEmpty();
.stream()
.flatMap(row -> row.stream().filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells())))
.findAny()
.isEmpty();
} }