RED-3816: Bugfix with adding images to sections

This commit is contained in:
Philipp Schramm 2022-05-30 13:52:57 +02:00
parent 77584c9a5a
commit c1192ceefd
3 changed files with 88 additions and 43 deletions

View File

@ -713,19 +713,22 @@ public class Section {
for (SectionArea sectionArea : sectionAreas) {
RedRectangle2D position = RedRectangle2D.builder()
.height(sectionArea.getHeight())
.width(sectionArea.getWidth())
.x(sectionArea.getTopLeft().getX())
.y(sectionArea.getTopLeft().getY())
.height(sectionArea.getHeight() + 4)
.width(sectionArea.getWidth() + 4)
.x(sectionArea.getTopLeft().getX() - 2)
.y(sectionArea.getTopLeft().getY() - 2)
.build();
log.debug("SectionArea: {}", sectionArea);
log.debug("Position {}", position.toString());
Image image = Image.builder()
.page(sectionArea.getPage())
.position(position)
.redaction(true)
.hasTransparency(false)
.sectionNumber(sectionNumber)
.section(sectionArea.getHeader())
.section(headline)
.matchedRule(ruleNumber)
.legalBasis(legalBasis)
.redactionReason(reason)

View File

@ -127,6 +127,7 @@ public class EntityRedactionService {
}));
}
log.debug("Section {}, Images: {}", reanalysisSection.getSectionNumber(), reanalysisSection.getImages());
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
.isLocal(false)

View File

@ -1,16 +1,31 @@
package com.iqser.red.service.redaction.v1.server.segmentation;
import com.iqser.red.service.redaction.v1.server.classification.model.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.stream.Collectors;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
public class SectionsBuilderService {
@ -53,8 +68,7 @@ public class SectionsBuilderService {
continue;
}
if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification()
.startsWith("H ") || !document.isHeadlines()) {
if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification().startsWith("H ") || !document.isHeadlines()) {
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
chunkBlock.setHeadline(lastHeadline);
if (document.isHeadlines()) {
@ -100,17 +114,19 @@ public class SectionsBuilderService {
public void addImagesToSections(Document document) {
Map<Integer, SortedSet<Paragraph>> paragraphMap = new HashMap<>();
Map<Integer, List<Paragraph>> paragraphMap = new HashMap<>();
for (Paragraph paragraph : document.getParagraphs()) {
for (AbstractTextContainer container : paragraph.getPageBlocks()) {
paragraphMap.computeIfAbsent(container.getPage(), x -> new TreeSet<>()).add(paragraph);
paragraphMap.computeIfAbsent(container.getPage(), c -> new ArrayList<>()).add(paragraph);
}
}
if (paragraphMap.isEmpty()) {
Paragraph paragraph = new Paragraph();
document.getParagraphs().add(paragraph);
paragraphMap.computeIfAbsent(1, x -> new TreeSet<>()).add(paragraph);
paragraphMap.computeIfAbsent(1, x -> new ArrayList<>()).add(paragraph);
}
// first page is always a paragraph, else we can't process pages 1..N,
@ -118,12 +134,12 @@ public class SectionsBuilderService {
if (paragraphMap.get(1) == null) {
Paragraph paragraph = new Paragraph();
document.getParagraphs().add(paragraph);
paragraphMap.computeIfAbsent(1, x -> new TreeSet<>()).add(paragraph);
paragraphMap.computeIfAbsent(1, x -> new ArrayList<>()).add(paragraph);
}
for (Page page : document.getPages()) {
for (PdfImage image : page.getImages()) {
SortedSet<Paragraph> paragraphsOnPage = paragraphMap.get(page.getPageNumber());
List<Paragraph> paragraphsOnPage = paragraphMap.get(page.getPageNumber());
if (paragraphsOnPage == null) {
int i = page.getPageNumber();
while (paragraphsOnPage == null) {
@ -131,27 +147,63 @@ public class SectionsBuilderService {
i--;
}
}
Float perviousEnd = 0f;
for (Paragraph paragraph : paragraphsOnPage) {
Float currentEnd = 0f;
Float xMin = null;
Float yMin = null;
Float xMax = null;
Float yMax = null;
for (AbstractTextContainer abs : paragraph.getPageBlocks()) {
if (abs.getPage() != page.getPageNumber()) {
continue;
}
if (abs.getMaxY() > currentEnd) {
currentEnd = abs.getMaxY();
if (abs.getMinX() < abs.getMaxX()) {
if (xMin == null || abs.getMinX() < xMin) {
xMin = abs.getMinX();
}
if (xMax == null || abs.getMaxX() > xMax) {
xMax = abs.getMaxX();
}
} else {
if (xMin == null || abs.getMaxX() < xMin) {
xMin = abs.getMaxX();
}
if (xMax == null || abs.getMinX() > xMax) {
xMax = abs.getMinX();
}
}
if (abs.getMinY() < abs.getMaxY()) {
if (yMin == null || abs.getMinY() < yMin) {
yMin = abs.getMinY();
}
if (yMax == null || abs.getMaxY() > yMax) {
yMax = abs.getMaxY();
}
} else {
if (yMin == null || abs.getMaxY() < yMin) {
yMin = abs.getMaxY();
}
if (yMax == null || abs.getMinY() > yMax) {
yMax = abs.getMinY();
}
}
}
if (image.getPosition().getY() >= perviousEnd && image.getPosition().getY() <= currentEnd) {
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
log.debug("Paragraph position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition()
.getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
paragraph.getImages().add(image);
image.setAppendedToParagraph(true);
}
perviousEnd = currentEnd;
}
if (!image.isAppendedToParagraph()) {
paragraphsOnPage.first().getImages().add(image);
log.debug("Image uses first paragraph");
paragraphsOnPage.get(0).getImages().add(image);
image.setAppendedToParagraph(true);
}
}
@ -166,9 +218,7 @@ public class SectionsBuilderService {
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
// Allow merging of tables if header row is separated from first logical non-header row
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows()
.get(0)
.size() == tableNonHeaderRow.size()) {
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> {
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
fakeCell.setHeaderCells(Collections.singletonList(cell));
@ -178,8 +228,7 @@ public class SectionsBuilderService {
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = currentTable.getRows().get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream()
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
for (int j = 0; j < row.size(); j++) {
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
}
@ -229,24 +278,20 @@ public class SectionsBuilderService {
TextBlock wordBlock = (TextBlock) container;
if (textBlock == null) {
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock
.getSequences(), wordBlock.getRotation());
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
textBlock.setPage(wordBlock.getPage());
} else if (splitByTable) {
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock
.getSequences(), wordBlock.getRotation());
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
textBlock.setPage(wordBlock.getPage());
alreadyAdded = false;
} else if (pageBefore != -1 && wordBlock.getPage() != pageBefore) {
textBlock.setPage(pageBefore);
paragraph.getPageBlocks().add(textBlock);
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock
.getSequences(), wordBlock.getRotation());
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
textBlock.setPage(wordBlock.getPage());
} else {
TextBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity
.getHeight());
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
}
pageBefore = wordBlock.getPage();
splitByTable = false;
@ -268,11 +313,7 @@ public class SectionsBuilderService {
/**
 * Checks whether the given table lacks header information entirely.
 *
 * @param table the table whose rows and cells are inspected
 * @return {@code true} if no cell in any row of the table has a non-empty
 *         collection of header cells; {@code false} as soon as one cell with
 *         header cells is found
 */
private boolean hasInvalidHeaderInformation(Table table) {

    // A single short-circuiting pass over all cells: the result is true only when
    // not one cell carries header cells (equivalent to filter(...).findAny().isEmpty()).
    return table.getRows()
            .stream()
            .flatMap(row -> row.stream())
            .noneMatch(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells()));
}