RED-3816: Bugfix with adding images to sections
This commit is contained in:
parent
77584c9a5a
commit
c1192ceefd
@ -713,19 +713,22 @@ public class Section {
|
||||
for (SectionArea sectionArea : sectionAreas) {
|
||||
|
||||
RedRectangle2D position = RedRectangle2D.builder()
|
||||
.height(sectionArea.getHeight())
|
||||
.width(sectionArea.getWidth())
|
||||
.x(sectionArea.getTopLeft().getX())
|
||||
.y(sectionArea.getTopLeft().getY())
|
||||
.height(sectionArea.getHeight() + 4)
|
||||
.width(sectionArea.getWidth() + 4)
|
||||
.x(sectionArea.getTopLeft().getX() - 2)
|
||||
.y(sectionArea.getTopLeft().getY() - 2)
|
||||
.build();
|
||||
|
||||
log.debug("SectionArea: {}", sectionArea);
|
||||
log.debug("Position {}", position.toString());
|
||||
|
||||
Image image = Image.builder()
|
||||
.page(sectionArea.getPage())
|
||||
.position(position)
|
||||
.redaction(true)
|
||||
.hasTransparency(false)
|
||||
.sectionNumber(sectionNumber)
|
||||
.section(sectionArea.getHeader())
|
||||
.section(headline)
|
||||
.matchedRule(ruleNumber)
|
||||
.legalBasis(legalBasis)
|
||||
.redactionReason(reason)
|
||||
|
||||
@ -127,6 +127,7 @@ public class EntityRedactionService {
|
||||
}));
|
||||
}
|
||||
|
||||
log.debug("Section {}, Images: {}", reanalysisSection.getSectionNumber(), reanalysisSection.getImages());
|
||||
|
||||
sectionSearchableTextPairs.add(new SectionSearchableTextPair(Section.builder()
|
||||
.isLocal(false)
|
||||
|
||||
@ -1,16 +1,31 @@
|
||||
package com.iqser.red.service.redaction.v1.server.segmentation;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Footer;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Header;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Page;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.Paragraph;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.TextBlock;
|
||||
import com.iqser.red.service.redaction.v1.server.classification.model.UnclassifiedText;
|
||||
import com.iqser.red.service.redaction.v1.server.redaction.model.PdfImage;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.AbstractTextContainer;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Cell;
|
||||
import com.iqser.red.service.redaction.v1.server.tableextraction.model.Table;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
public class SectionsBuilderService {
|
||||
|
||||
@ -53,8 +68,7 @@ public class SectionsBuilderService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification()
|
||||
.startsWith("H ") || !document.isHeadlines()) {
|
||||
if (prev != null && current.getClassification().startsWith("H ") && !prev.getClassification().startsWith("H ") || !document.isHeadlines()) {
|
||||
Paragraph chunkBlock = buildTextBlock(chunkWords, lastHeadline);
|
||||
chunkBlock.setHeadline(lastHeadline);
|
||||
if (document.isHeadlines()) {
|
||||
@ -100,17 +114,19 @@ public class SectionsBuilderService {
|
||||
|
||||
public void addImagesToSections(Document document) {
|
||||
|
||||
Map<Integer, SortedSet<Paragraph>> paragraphMap = new HashMap<>();
|
||||
Map<Integer, List<Paragraph>> paragraphMap = new HashMap<>();
|
||||
for (Paragraph paragraph : document.getParagraphs()) {
|
||||
for (AbstractTextContainer container : paragraph.getPageBlocks()) {
|
||||
paragraphMap.computeIfAbsent(container.getPage(), x -> new TreeSet<>()).add(paragraph);
|
||||
|
||||
paragraphMap.computeIfAbsent(container.getPage(), c -> new ArrayList<>()).add(paragraph);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if (paragraphMap.isEmpty()) {
|
||||
Paragraph paragraph = new Paragraph();
|
||||
document.getParagraphs().add(paragraph);
|
||||
paragraphMap.computeIfAbsent(1, x -> new TreeSet<>()).add(paragraph);
|
||||
paragraphMap.computeIfAbsent(1, x -> new ArrayList<>()).add(paragraph);
|
||||
}
|
||||
|
||||
// first page is always a paragraph, else we can't process pages 1..N,
|
||||
@ -118,12 +134,12 @@ public class SectionsBuilderService {
|
||||
if (paragraphMap.get(1) == null) {
|
||||
Paragraph paragraph = new Paragraph();
|
||||
document.getParagraphs().add(paragraph);
|
||||
paragraphMap.computeIfAbsent(1, x -> new TreeSet<>()).add(paragraph);
|
||||
paragraphMap.computeIfAbsent(1, x -> new ArrayList<>()).add(paragraph);
|
||||
}
|
||||
|
||||
for (Page page : document.getPages()) {
|
||||
for (PdfImage image : page.getImages()) {
|
||||
SortedSet<Paragraph> paragraphsOnPage = paragraphMap.get(page.getPageNumber());
|
||||
List<Paragraph> paragraphsOnPage = paragraphMap.get(page.getPageNumber());
|
||||
if (paragraphsOnPage == null) {
|
||||
int i = page.getPageNumber();
|
||||
while (paragraphsOnPage == null) {
|
||||
@ -131,27 +147,63 @@ public class SectionsBuilderService {
|
||||
i--;
|
||||
}
|
||||
}
|
||||
|
||||
Float perviousEnd = 0f;
|
||||
for (Paragraph paragraph : paragraphsOnPage) {
|
||||
Float currentEnd = 0f;
|
||||
Float xMin = null;
|
||||
Float yMin = null;
|
||||
Float xMax = null;
|
||||
Float yMax = null;
|
||||
|
||||
for (AbstractTextContainer abs : paragraph.getPageBlocks()) {
|
||||
if (abs.getPage() != page.getPageNumber()) {
|
||||
continue;
|
||||
}
|
||||
if (abs.getMaxY() > currentEnd) {
|
||||
currentEnd = abs.getMaxY();
|
||||
|
||||
if (abs.getMinX() < abs.getMaxX()) {
|
||||
if (xMin == null || abs.getMinX() < xMin) {
|
||||
xMin = abs.getMinX();
|
||||
}
|
||||
if (xMax == null || abs.getMaxX() > xMax) {
|
||||
xMax = abs.getMaxX();
|
||||
}
|
||||
} else {
|
||||
if (xMin == null || abs.getMaxX() < xMin) {
|
||||
xMin = abs.getMaxX();
|
||||
}
|
||||
if (xMax == null || abs.getMinX() > xMax) {
|
||||
xMax = abs.getMinX();
|
||||
}
|
||||
}
|
||||
|
||||
if (abs.getMinY() < abs.getMaxY()) {
|
||||
if (yMin == null || abs.getMinY() < yMin) {
|
||||
yMin = abs.getMinY();
|
||||
}
|
||||
if (yMax == null || abs.getMaxY() > yMax) {
|
||||
yMax = abs.getMaxY();
|
||||
}
|
||||
} else {
|
||||
if (yMin == null || abs.getMaxY() < yMin) {
|
||||
yMin = abs.getMaxY();
|
||||
}
|
||||
if (yMax == null || abs.getMinY() > yMax) {
|
||||
yMax = abs.getMinY();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (image.getPosition().getY() >= perviousEnd && image.getPosition().getY() <= currentEnd) {
|
||||
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
|
||||
log.debug("Paragraph position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
|
||||
|
||||
if (xMin != null && xMax != null && yMin != null && yMax != null && image.getPosition().getX() >= xMin && image.getPosition()
|
||||
.getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
|
||||
paragraph.getImages().add(image);
|
||||
image.setAppendedToParagraph(true);
|
||||
}
|
||||
perviousEnd = currentEnd;
|
||||
}
|
||||
if (!image.isAppendedToParagraph()) {
|
||||
paragraphsOnPage.first().getImages().add(image);
|
||||
log.debug("Image uses first paragraph");
|
||||
paragraphsOnPage.get(0).getImages().add(image);
|
||||
image.setAppendedToParagraph(true);
|
||||
}
|
||||
}
|
||||
@ -166,9 +218,7 @@ public class SectionsBuilderService {
|
||||
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
|
||||
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
|
||||
// Allow merging of tables if header row is separated from first logical non-header row
|
||||
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows()
|
||||
.get(0)
|
||||
.size() == tableNonHeaderRow.size()) {
|
||||
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows().get(0).stream().map(cell -> {
|
||||
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
|
||||
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
||||
@ -178,8 +228,7 @@ public class SectionsBuilderService {
|
||||
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
|
||||
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = currentTable.getRows().get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream()
|
||||
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream().allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
|
||||
}
|
||||
@ -229,24 +278,20 @@ public class SectionsBuilderService {
|
||||
TextBlock wordBlock = (TextBlock) container;
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock
|
||||
.getSequences(), wordBlock.getRotation());
|
||||
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
|
||||
textBlock.setPage(wordBlock.getPage());
|
||||
} else if (splitByTable) {
|
||||
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock
|
||||
.getSequences(), wordBlock.getRotation());
|
||||
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
|
||||
textBlock.setPage(wordBlock.getPage());
|
||||
alreadyAdded = false;
|
||||
} else if (pageBefore != -1 && wordBlock.getPage() != pageBefore) {
|
||||
textBlock.setPage(pageBefore);
|
||||
paragraph.getPageBlocks().add(textBlock);
|
||||
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock
|
||||
.getSequences(), wordBlock.getRotation());
|
||||
textBlock = new TextBlock(wordBlock.getMinX(), wordBlock.getMaxX(), wordBlock.getMinY(), wordBlock.getMaxY(), wordBlock.getSequences(), wordBlock.getRotation());
|
||||
textBlock.setPage(wordBlock.getPage());
|
||||
} else {
|
||||
TextBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity
|
||||
.getHeight());
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
pageBefore = wordBlock.getPage();
|
||||
splitByTable = false;
|
||||
@ -268,11 +313,7 @@ public class SectionsBuilderService {
|
||||
|
||||
/**
 * Reports whether the given table carries no header information at all.
 *
 * Walks every row of the table and looks for any cell whose header-cell
 * collection is non-empty (per {@code CollectionUtils.isNotEmpty}).
 *
 * @param table the table whose rows and cells are inspected
 * @return {@code true} if no cell in any row has header cells assigned,
 *         {@code false} as soon as at least one such cell exists
 */
private boolean hasInvalidHeaderInformation(Table table) {
|
||||
|
||||
return table.getRows()
|
||||
.stream()
|
||||
.flatMap(row -> row.stream().filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells())))
|
||||
.findAny()
|
||||
.isEmpty();
|
||||
return table.getRows().stream().flatMap(row -> row.stream().filter(cell -> CollectionUtils.isNotEmpty(cell.getHeaderCells()))).findAny().isEmpty();
|
||||
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user