RED-6009: Document Tree Structure
*fix pmd violations
This commit is contained in:
parent
f9d258e1fc
commit
d5635e9660
@ -12,6 +12,6 @@
|
||||
</parent>
|
||||
|
||||
<artifactId>layoutparser-service-image</artifactId>
|
||||
<version>1.0.0</version>
|
||||
|
||||
|
||||
</project>
|
||||
|
||||
@ -1,6 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Stream;
|
||||
@ -11,6 +10,7 @@ import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@ -31,7 +31,7 @@ public class TableOfContentsData {
|
||||
}
|
||||
EntryData entry = root.subEntries.get(tocId.get(0));
|
||||
for (int id : tocId.subList(1, tocId.size())) {
|
||||
entry = entry.subEntries().get(id);
|
||||
entry = entry.subEntries.get(id);
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
@ -43,12 +43,6 @@ public class TableOfContentsData {
|
||||
}
|
||||
|
||||
|
||||
private static List<Integer> getIds(String idsAsString) {
|
||||
|
||||
return Arrays.stream(idsAsString.split("\\.")).map(Integer::valueOf).toList();
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
return String.join("\n", streamAllEntries().map(EntryData::toString).toList());
|
||||
@ -57,12 +51,23 @@ public class TableOfContentsData {
|
||||
|
||||
private static Stream<EntryData> flatten(EntryData entry) {
|
||||
|
||||
return Stream.concat(Stream.of(entry), entry.subEntries().stream().flatMap(TableOfContentsData::flatten));
|
||||
return Stream.concat(Stream.of(entry), entry.subEntries.stream().flatMap(TableOfContentsData::flatten));
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
public record EntryData(NodeType type, int[] tocId, Long[] atomicBlocks, Long[] pages, Map<String, String> properties, List<EntryData> subEntries) {
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public static class EntryData {
|
||||
|
||||
NodeType type;
|
||||
int[] tocId;
|
||||
Long[] atomicBlocks;
|
||||
Long[] pages;
|
||||
Map<String, String> properties;
|
||||
List<EntryData> subEntries;
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
@ -77,7 +77,7 @@ public class DocumentGraph implements SemanticNode {
|
||||
|
||||
private Stream<SemanticNode> streamAllNodes() {
|
||||
|
||||
return tableOfContents.streamAllEntriesInOrder().map(TableOfContents.Entry::node);
|
||||
return tableOfContents.streamAllEntriesInOrder().map(TableOfContents.Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
@ -14,8 +14,12 @@ import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.Seman
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.textblock.TextBlockCollector;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Data
|
||||
public class TableOfContents {
|
||||
@ -31,7 +35,7 @@ public class TableOfContents {
|
||||
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
return streamAllEntriesInOrder().map(Entry::node).filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
|
||||
return streamAllEntriesInOrder().map(Entry::getNode).filter(SemanticNode::isTerminal).map(SemanticNode::getTerminalTextBlock).collect(new TextBlockCollector());
|
||||
}
|
||||
|
||||
|
||||
@ -49,8 +53,8 @@ public class TableOfContents {
|
||||
|
||||
Entry parent = getEntryById(parentId);
|
||||
List<Integer> newId = new LinkedList<>(parentId);
|
||||
newId.add(parent.children().size());
|
||||
parent.children().add(Entry.builder().tocId(newId).node(node).type(nodeType).children(new LinkedList<>()).build());
|
||||
newId.add(parent.children.size());
|
||||
parent.children.add(Entry.builder().tocId(newId).node(node).type(nodeType).children(new LinkedList<>()).build());
|
||||
|
||||
return newId;
|
||||
}
|
||||
@ -66,7 +70,7 @@ public class TableOfContents {
|
||||
if (id >= entry.children.size() || 0 > id) {
|
||||
return false;
|
||||
}
|
||||
entry = entry.children().get(id);
|
||||
entry = entry.children.get(id);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@ -86,7 +90,7 @@ public class TableOfContents {
|
||||
|
||||
public Stream<SemanticNode> streamChildrenNodes(List<Integer> tocId) {
|
||||
|
||||
return getEntryById(tocId).children().stream().map(Entry::node);
|
||||
return getEntryById(tocId).children.stream().map(Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
@ -109,7 +113,7 @@ public class TableOfContents {
|
||||
}
|
||||
Entry entry = root.children.get(tocId.get(0));
|
||||
for (int id : tocId.subList(1, tocId.size())) {
|
||||
entry = entry.children().get(id);
|
||||
entry = entry.children.get(id);
|
||||
}
|
||||
return entry;
|
||||
}
|
||||
@ -148,17 +152,26 @@ public class TableOfContents {
|
||||
|
||||
private static Stream<Entry> flatten(Entry entry) {
|
||||
|
||||
return Stream.concat(Stream.of(entry), entry.children().stream().flatMap(TableOfContents::flatten));
|
||||
return Stream.concat(Stream.of(entry), entry.children.stream().flatMap(TableOfContents::flatten));
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
public record Entry(List<Integer> tocId, NodeType type, SemanticNode node, List<Entry> children) {
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
public static class Entry {
|
||||
|
||||
List<Integer> tocId;
|
||||
NodeType type;
|
||||
SemanticNode node;
|
||||
List<Entry> children;
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return node().toString();
|
||||
return node.toString();
|
||||
}
|
||||
|
||||
|
||||
@ -168,6 +181,13 @@ public class TableOfContents {
|
||||
return Hashing.murmur3_32_fixed().hashString(toString(), StandardCharsets.UTF_8).hashCode();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
|
||||
return o instanceof Entry && o.hashCode() == this.hashCode();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -28,7 +28,7 @@ public interface EntityNode {
|
||||
|
||||
/**
|
||||
* The deepest fully containing node represents the node which is the deepest node in the document tree structure,
|
||||
* whose boundary also fully contains the boundary of this entity
|
||||
* whose boundary also fully contains the boundary of this entity.
|
||||
*
|
||||
* @return the deepest fully containing node
|
||||
*/
|
||||
@ -62,7 +62,7 @@ public interface EntityNode {
|
||||
|
||||
|
||||
/**
|
||||
* removes all occurrences of this node in the graph and resets all graph specific fields
|
||||
* removes all occurrences of this node in the graph and resets all graph specific fields.
|
||||
*/
|
||||
default void removeFromGraph() {
|
||||
|
||||
@ -36,4 +36,10 @@ public class EntityPosition {
|
||||
return Hashing.murmur3_128().hashString(sb.toString(), StandardCharsets.UTF_8).hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
|
||||
return o instanceof EntityPosition && o.hashCode() == this.hashCode();
|
||||
}
|
||||
|
||||
}
|
||||
@ -34,10 +34,10 @@ public class ImageNode implements SemanticNode {
|
||||
boolean transparency;
|
||||
Rectangle2D position;
|
||||
|
||||
@Builder.Default
|
||||
boolean redaction = false;
|
||||
@Builder.Default
|
||||
boolean ignored = false;
|
||||
|
||||
boolean redaction;
|
||||
boolean ignored;
|
||||
|
||||
@Builder.Default
|
||||
String redactionReason = "";
|
||||
@Builder.Default
|
||||
@ -55,7 +55,6 @@ public class ImageNode implements SemanticNode {
|
||||
@EqualsAndHashCode.Exclude
|
||||
Set<EntityNode> entities = new HashSet<>();
|
||||
|
||||
|
||||
@Override
|
||||
public TextBlock buildTextBlock() {
|
||||
|
||||
@ -63,4 +63,9 @@ public class PageNode {
|
||||
return number;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
|
||||
return o instanceof PageNode && o.hashCode() == this.hashCode();
|
||||
}
|
||||
}
|
||||
@ -37,7 +37,7 @@ public interface SemanticNode {
|
||||
|
||||
|
||||
/**
|
||||
* Each AtomicTextBlock is assigned a page, so to get the pages this node appears on, it collects the PageNodes from each AtomicTextBlock belonging to this node's ClassificationTextBlock
|
||||
* Each AtomicTextBlock is assigned a page, so to get the pages this node appears on, it collects the PageNodes from each AtomicTextBlock belonging to this node's ClassificationTextBlock.
|
||||
*
|
||||
* @return Set of PageNodes this node appears on.
|
||||
*/
|
||||
@ -54,7 +54,7 @@ public interface SemanticNode {
|
||||
|
||||
|
||||
/**
|
||||
* The id is a List of Integers uniquely identifying this node in the TableOfContents
|
||||
* The id is a List of Integers uniquely identifying this node in the TableOfContents.
|
||||
*
|
||||
* @return the TableOfContents ID
|
||||
*/
|
||||
@ -62,7 +62,7 @@ public interface SemanticNode {
|
||||
|
||||
|
||||
/**
|
||||
* This should only be used during graph construction
|
||||
* This should only be used during graph construction.
|
||||
*
|
||||
* @param tocId List of Integers
|
||||
*/
|
||||
@ -96,7 +96,7 @@ public interface SemanticNode {
|
||||
*/
|
||||
default SemanticNode getParent() {
|
||||
|
||||
return getTableOfContents().getParentEntryById(getTocId()).node();
|
||||
return getTableOfContents().getParentEntryById(getTocId()).getNode();
|
||||
}
|
||||
|
||||
|
||||
@ -199,7 +199,7 @@ public interface SemanticNode {
|
||||
|
||||
|
||||
/**
|
||||
* Streams all children located directly underneath this node in the TableOfContents
|
||||
* Streams all children located directly underneath this node in the TableOfContents.
|
||||
*
|
||||
* @return Stream of all children
|
||||
*/
|
||||
@ -216,7 +216,7 @@ public interface SemanticNode {
|
||||
*/
|
||||
default Stream<SemanticNode> streamAllSubNodes() {
|
||||
|
||||
return getTableOfContents().streamAllSubEntriesInOrder(getTocId()).map(TableOfContents.Entry::node);
|
||||
return getTableOfContents().streamAllSubEntriesInOrder(getTocId()).map(TableOfContents.Entry::getNode);
|
||||
}
|
||||
|
||||
|
||||
@ -247,7 +247,7 @@ public interface SemanticNode {
|
||||
|
||||
|
||||
/**
|
||||
* TODO this does not yet work for sections spanning multiple columns
|
||||
* TODO this does not yet work for sections spanning multiple columns.
|
||||
*
|
||||
* @param bBoxPerPage initial empty BoundingBox
|
||||
* @return The union of the BoundingBoxes of all children
|
||||
@ -54,7 +54,7 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
|
||||
private AtomicTextBlock getAtomicTextBlockByStringIndex(int stringIdx) {
|
||||
|
||||
return atomicTextBlocks.stream().filter(textBlock -> (textBlock.getBoundary().contains(stringIdx))).findAny().orElseThrow(IndexOutOfBoundsException::new);
|
||||
return atomicTextBlocks.stream().filter(textBlock -> textBlock.getBoundary().contains(stringIdx)).findAny().orElseThrow(IndexOutOfBoundsException::new);
|
||||
}
|
||||
|
||||
|
||||
@ -59,25 +59,25 @@ public class DocumentDataMapper {
|
||||
|
||||
Long[] atomicTextBlocks;
|
||||
|
||||
if (entry.node().isTerminal()) {
|
||||
atomicTextBlocks = toAtomicTextBlockIds(entry.node().getTerminalTextBlock());
|
||||
if (entry.getNode().isTerminal()) {
|
||||
atomicTextBlocks = toAtomicTextBlockIds(entry.getNode().getTerminalTextBlock());
|
||||
} else {
|
||||
atomicTextBlocks = new Long[]{};
|
||||
}
|
||||
|
||||
Map<String, String> properties = switch (entry.type()) {
|
||||
case TABLE -> PropertiesMapper.buildTableProperties((TableNode) entry.node());
|
||||
case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCellNode) entry.node());
|
||||
case IMAGE -> PropertiesMapper.buildImageProperties((ImageNode) entry.node());
|
||||
Map<String, String> properties = switch (entry.getType()) {
|
||||
case TABLE -> PropertiesMapper.buildTableProperties((TableNode) entry.getNode());
|
||||
case TABLE_CELL -> PropertiesMapper.buildTableCellProperties((TableCellNode) entry.getNode());
|
||||
case IMAGE -> PropertiesMapper.buildImageProperties((ImageNode) entry.getNode());
|
||||
default -> new HashMap<>();
|
||||
};
|
||||
|
||||
return TableOfContentsData.EntryData.builder()
|
||||
.tocId(toPrimitiveIntArray(entry.tocId()))
|
||||
.subEntries(entry.children().stream().map(DocumentDataMapper::toEntryData).toList())
|
||||
.type(entry.type())
|
||||
.tocId(toPrimitiveIntArray(entry.getTocId()))
|
||||
.subEntries(entry.getChildren().stream().map(DocumentDataMapper::toEntryData).toList())
|
||||
.type(entry.getType())
|
||||
.atomicBlocks(atomicTextBlocks)
|
||||
.pages(entry.node().getPages().stream().map(PageNode::getNumber).map(Integer::longValue).toArray(Long[]::new))
|
||||
.pages(entry.getNode().getPages().stream().map(PageNode::getNumber).map(Integer::longValue).toArray(Long[]::new))
|
||||
.properties(properties)
|
||||
.build();
|
||||
}
|
||||
@ -52,7 +52,7 @@ public class DocumentGraphMapper {
|
||||
|
||||
context.pages.addAll(Arrays.stream(documentData.getPages()).map(DocumentGraphMapper::buildPage).toList());
|
||||
|
||||
context.tableOfContents.getRoot().children().addAll(buildEntries(documentData.getTableOfContents().getRoot().subEntries(), context));
|
||||
context.tableOfContents.getRoot().getChildren().addAll(buildEntries(documentData.getTableOfContents().getRoot().getSubEntries(), context));
|
||||
|
||||
documentGraph.setTableOfContents(context.tableOfContents);
|
||||
documentGraph.setPages(new HashSet<>(context.pages));
|
||||
@ -70,35 +70,35 @@ public class DocumentGraphMapper {
|
||||
for (TableOfContentsData.EntryData entryData : entries) {
|
||||
|
||||
boolean terminal = isTerminal(entryData);
|
||||
List<PageNode> pages = Arrays.stream(entryData.pages()).map(pageNumber -> getPage(pageNumber, context)).toList();
|
||||
List<PageNode> pages = Arrays.stream(entryData.getPages()).map(pageNumber -> getPage(pageNumber, context)).toList();
|
||||
|
||||
SemanticNode node = switch (entryData.type()) {
|
||||
SemanticNode node = switch (entryData.getType()) {
|
||||
case SECTION -> buildSection(context);
|
||||
case PARAGRAPH -> buildParagraph(context, terminal);
|
||||
case HEADLINE -> buildHeadline(context, terminal);
|
||||
case HEADER -> buildHeader(context, terminal);
|
||||
case FOOTER -> buildFooter(context, terminal);
|
||||
case TABLE -> buildTable(context, entryData.properties());
|
||||
case TABLE_CELL -> buildTableCell(context, entryData.properties(), terminal);
|
||||
case IMAGE -> buildImage(context, entryData.properties());
|
||||
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.type());
|
||||
case TABLE -> buildTable(context, entryData.getProperties());
|
||||
case TABLE_CELL -> buildTableCell(context, entryData.getProperties(), terminal);
|
||||
case IMAGE -> buildImage(context, entryData.getProperties());
|
||||
default -> throw new UnsupportedOperationException("Not yet implemented for type " + entryData.getType());
|
||||
};
|
||||
|
||||
if (node.isTerminal()) {
|
||||
TextBlock textBlock = toTextBlock(entryData.atomicBlocks(), context, node);
|
||||
TextBlock textBlock = toTextBlock(entryData.getAtomicBlocks(), context, node);
|
||||
node.setTerminalTextBlock(textBlock);
|
||||
}
|
||||
List<Integer> tocId = Arrays.stream(entryData.tocId()).boxed().toList();
|
||||
List<Integer> tocId = Arrays.stream(entryData.getTocId()).boxed().toList();
|
||||
node.setTocId(tocId);
|
||||
|
||||
if (entryData.type() == HEADER) {
|
||||
if (entryData.getType() == HEADER) {
|
||||
pages.forEach(page -> page.setHeader((HeaderNode) node));
|
||||
} else if (entryData.type() == FOOTER) {
|
||||
} else if (entryData.getType() == FOOTER) {
|
||||
pages.forEach(page -> page.setFooter((FooterNode) node));
|
||||
} else {
|
||||
pages.forEach(page -> page.getMainBody().add(node));
|
||||
}
|
||||
newEntries.add(TableOfContents.Entry.builder().tocId(tocId).type(entryData.type()).children(buildEntries(entryData.subEntries(), context)).node(node).build());
|
||||
newEntries.add(TableOfContents.Entry.builder().tocId(tocId).type(entryData.getType()).children(buildEntries(entryData.getSubEntries(), context)).node(node).build());
|
||||
}
|
||||
return newEntries;
|
||||
}
|
||||
@ -112,7 +112,7 @@ public class DocumentGraphMapper {
|
||||
|
||||
private static boolean isTerminal(TableOfContentsData.EntryData entryData) {
|
||||
|
||||
return entryData.atomicBlocks().length > 0;
|
||||
return entryData.getAtomicBlocks().length > 0;
|
||||
}
|
||||
|
||||
|
||||
@ -4,7 +4,6 @@ import java.util.Collections;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.entity.EntityNode;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.graph.nodes.PageNode;
|
||||
@ -54,10 +53,4 @@ public class EntityInsertionService {
|
||||
entity.getIntersectingNodes().forEach(node -> node.getEntities().add(entity));
|
||||
}
|
||||
|
||||
|
||||
private static Boundary toLineAfterBoundary(TextBlock textBlock, Boundary boundary) {
|
||||
|
||||
return new Boundary(boundary.end(), textBlock.getNextLinebreak(boundary.end()));
|
||||
}
|
||||
|
||||
}
|
||||
@ -2,8 +2,6 @@ package com.knecon.fforesight.service.layoutparser.processor.classification.dto.
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Ruling;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
|
||||
@ -1,132 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.factory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.AbstractTextContainer;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.ClassificationSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.table.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.dto.text.ClassificationTextBlock;
|
||||
|
||||
@Service
|
||||
public class ImageSortService {
|
||||
|
||||
public SortedImages sortImagesIntoStructure(ClassificationDocument document) {
|
||||
|
||||
SortedImages sortedImages = new SortedImages(new HashMap<>(), new HashMap<>(), new HashMap<>(), new HashMap<>(), new HashMap<>());
|
||||
|
||||
Map<Integer, List<ClassifiedImage>> imagesByPage = document.getSections()
|
||||
.stream()
|
||||
.flatMap(section -> section.getImages().stream())
|
||||
.distinct()
|
||||
.collect(Collectors.groupingBy(ClassifiedImage::getPage));
|
||||
|
||||
for (int pageNumber : imagesByPage.keySet()) {
|
||||
List<AbstractTextContainer> textContainersOnPage = document.getSections()
|
||||
.stream()
|
||||
.flatMap(section -> section.getPageBlocks().stream())
|
||||
.filter(abstractTextContainer -> abstractTextContainer.getPage() == pageNumber)
|
||||
.toList();
|
||||
|
||||
List<ClassificationSection> sectionsOnPage = document.getSections()
|
||||
.stream()
|
||||
.filter(section -> section.getPageBlocks().stream().anyMatch(block -> block.getPage() == pageNumber))
|
||||
.toList();
|
||||
|
||||
for (ClassifiedImage image : imagesByPage.get(pageNumber)) {
|
||||
sortImage(textContainersOnPage, sectionsOnPage, image, sortedImages);
|
||||
}
|
||||
}
|
||||
return sortedImages;
|
||||
}
|
||||
|
||||
|
||||
private void sortImage(List<AbstractTextContainer> textContainersOnPage, List<ClassificationSection> sectionsOnPage, ClassifiedImage image, SortedImages sortedImages) {
|
||||
|
||||
Optional<AbstractTextContainer> containingTextContainer = getContainingTextContainer(image, textContainersOnPage);
|
||||
Optional<ClassificationSection> sectionContainingTextContainer = getContainingSection(image, sectionsOnPage);
|
||||
List<AbstractTextContainer> containedTextContainers = getContainedTextContainers(image, textContainersOnPage);
|
||||
List<ClassificationSection> containedSections = getContainedSections(image, sectionsOnPage);
|
||||
if (containingTextContainer.isPresent()) {
|
||||
if (sortImageIntoTextContainerOrCell(image, sortedImages, containingTextContainer.get())) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static boolean sortImageIntoTextContainerOrCell(ClassifiedImage image, SortedImages sortedImages, AbstractTextContainer containingTextContainer) {
|
||||
|
||||
if (containingTextContainer instanceof ClassificationTextBlock) {
|
||||
sortedImages.containedInTextContainer().computeIfAbsent(containingTextContainer, sortedImage -> new ArrayList<>()).add(image);
|
||||
return true;
|
||||
}
|
||||
if (containingTextContainer instanceof Table) {
|
||||
Optional<TableCell> containingCell = getContainingCell((Table) containingTextContainer, image);
|
||||
if (containingCell.isPresent()) {
|
||||
sortedImages.containedInCell().computeIfAbsent(containingCell.get(), sortedImage -> new ArrayList<>()).add(image);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private static Optional<TableCell> getContainingCell(Table table, ClassifiedImage image) {
|
||||
|
||||
return table.getRows().stream().flatMap(List::stream).filter(cell -> cell.contains(image.getPosition())).findFirst();
|
||||
}
|
||||
|
||||
|
||||
private List<ClassificationSection> getContainedSections(ClassifiedImage image, List<ClassificationSection> sectionsOnPage) {
|
||||
|
||||
return sectionsOnPage.stream()
|
||||
.filter(section -> image.getPosition().contains(RectangleTransformations.bBoxUnionAbstractTextContainer(section.getPageBlocks()
|
||||
.stream()
|
||||
.filter(block -> block.getPage() == image.getPage())
|
||||
.toList())))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private List<AbstractTextContainer> getContainedTextContainers(ClassifiedImage image, List<AbstractTextContainer> textContainersOnPage) {
|
||||
|
||||
return textContainersOnPage.stream().filter(textContainer -> image.getPosition().contains(RectangleTransformations.toRectangle2D(textContainer))).toList();
|
||||
}
|
||||
|
||||
|
||||
private Optional<ClassificationSection> getContainingSection(ClassifiedImage image, List<ClassificationSection> sectionsOnPage) {
|
||||
|
||||
return sectionsOnPage.stream()//
|
||||
.filter(section -> //
|
||||
RectangleTransformations.bBoxUnionAbstractTextContainer(section.getPageBlocks().stream().filter(block -> block.getPage() == image.getPage()).toList())//
|
||||
.contains(image.getPosition())).findFirst();
|
||||
}
|
||||
|
||||
|
||||
private Optional<AbstractTextContainer> getContainingTextContainer(ClassifiedImage image, List<AbstractTextContainer> textContainersOnPage) {
|
||||
|
||||
return textContainersOnPage.stream().filter(textContainer -> RectangleTransformations.toRectangle2D(textContainer).contains(image.getPosition())).findFirst();
|
||||
}
|
||||
|
||||
|
||||
public record SortedImages(
|
||||
Map<TableCell, List<ClassifiedImage>> containedInCell,
|
||||
Map<AbstractTextContainer, List<ClassifiedImage>> containedInTextContainer,
|
||||
Map<ClassificationSection, List<ClassifiedImage>> containedInSection,
|
||||
Map<ClassifiedImage, List<AbstractTextContainer>> containedByImage,
|
||||
Map<ClassifiedImage, List<ClassificationSection>> sectionContainedByImage) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -6,6 +6,7 @@ import java.security.SecureRandom;
|
||||
import java.security.spec.KeySpec;
|
||||
import java.util.Base64;
|
||||
|
||||
import javax.annotation.PostConstruct;
|
||||
import javax.crypto.Cipher;
|
||||
import javax.crypto.SecretKey;
|
||||
import javax.crypto.SecretKeyFactory;
|
||||
@ -16,7 +17,6 @@ import javax.crypto.spec.SecretKeySpec;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import jakarta.annotation.PostConstruct;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Service
|
||||
|
||||
@ -65,18 +65,54 @@
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<configuration>
|
||||
<annotationProcessors>
|
||||
<annotationProcessor>lombok.launch.AnnotationProcessorHider$AnnotationProcessor</annotationProcessor>
|
||||
<annotationProcessor>com.dslplatform.json.processor.CompiledJsonAnnotationProcessor</annotationProcessor>
|
||||
</annotationProcessors>
|
||||
</configuration>
|
||||
</plugin>
|
||||
|
||||
|
||||
<plugin>
|
||||
<!-- generate git.properties for exposure in /info -->
|
||||
<groupId>pl.project13.maven</groupId>
|
||||
<artifactId>git-commit-id-plugin</artifactId>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>revision</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<generateGitPropertiesFile>true</generateGitPropertiesFile>
|
||||
<gitDescribe>
|
||||
<tags>true</tags>
|
||||
</gitDescribe>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<!-- repackages the generated jar into a runnable fat-jar and makes it
|
||||
executable -->
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-maven-plugin</artifactId>
|
||||
<configuration>
|
||||
<excludes>
|
||||
<exclude>
|
||||
<groupId>org.projectlombok</groupId>
|
||||
<artifactId>lombok</artifactId>
|
||||
</exclude>
|
||||
</excludes>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>repackage</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<executable>true</executable>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
|
||||
</project>
|
||||
|
||||
@ -13,9 +13,10 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutparserServiceP
|
||||
import com.knecon.fforesight.service.layoutparser.processor.multitenancy.AsyncConfig;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.multitenancy.MultiTenancyMessagingConfiguration;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.multitenancy.MultiTenancyWebConfiguration;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.multitenancy.TenantsClient;
|
||||
|
||||
@Import({MultiTenancyWebConfiguration.class, AsyncConfig.class, MultiTenancyMessagingConfiguration.class, MetricsConfiguration.class, LayoutparserServiceProcessorConfiguration.class, StorageAutoConfiguration.class})
|
||||
@EnableFeignClients
|
||||
@EnableFeignClients(basePackageClasses = TenantsClient.class)
|
||||
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
|
||||
public class Application {
|
||||
|
||||
@ -121,8 +121,8 @@ public class DocumentGraphEntityInsertionTest extends BuildDocumentGraphTest {
|
||||
DocumentGraph documentGraph = buildGraph("files/crafted document");
|
||||
TableNode table = (TableNode) documentGraph.getTableOfContents()//
|
||||
.streamAllEntriesInOrder()//
|
||||
.filter(entry -> entry.type().equals(NodeType.TABLE))//
|
||||
.map(TableOfContents.Entry::node)//
|
||||
.filter(entry -> entry.getType().equals(NodeType.TABLE))//
|
||||
.map(TableOfContents.Entry::getNode)//
|
||||
.findFirst().orElseThrow();
|
||||
assertEquals(5, table.getNumberOfCols());
|
||||
assertEquals(4, table.getNumberOfRows());
|
||||
@ -151,9 +151,9 @@ public class DocumentGraphEntityInsertionTest extends BuildDocumentGraphTest {
|
||||
DocumentGraph documentGraph = buildGraph("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06");
|
||||
TableNode table = (TableNode) documentGraph.getTableOfContents()
|
||||
.streamAllEntriesInOrder()
|
||||
.filter(entry -> entry.node().getPages().stream().anyMatch(page -> page.getNumber() == 22))
|
||||
.filter(entry -> entry.type().equals(NodeType.TABLE))
|
||||
.map(TableOfContents.Entry::node)
|
||||
.filter(entry -> entry.getNode().getPages().stream().anyMatch(page -> page.getNumber() == 22))
|
||||
.filter(entry -> entry.getType().equals(NodeType.TABLE))
|
||||
.map(TableOfContents.Entry::getNode)
|
||||
.findFirst()
|
||||
.orElseThrow();
|
||||
assertEquals(5, table.getNumberOfCols());
|
||||
|
||||
@ -138,7 +138,7 @@ public class PdfDraw {
|
||||
|
||||
private static Options buildStandardOptionsForNodes(TableOfContents.Entry entry) {
|
||||
|
||||
return Options.builder().stroke(true).strokeColor(switch (entry.type()) {
|
||||
return Options.builder().stroke(true).strokeColor(switch (entry.getType()) {
|
||||
case DOCUMENT -> Color.LIGHT_GRAY;
|
||||
case HEADER, FOOTER -> Color.GREEN;
|
||||
case PARAGRAPH -> Color.BLUE;
|
||||
@ -153,20 +153,20 @@ public class PdfDraw {
|
||||
|
||||
private static void drawBBoxAndLabelAndNumberOnPage(PDDocument document, TableOfContents.Entry entry, Options options) {
|
||||
|
||||
Map<PageNode, Rectangle2D> rectanglesPerPage = entry.node().getBBox();
|
||||
Map<PageNode, Rectangle2D> rectanglesPerPage = entry.getNode().getBBox();
|
||||
rectanglesPerPage.forEach((page, rectangle2D) -> {
|
||||
if (entry.type() == NodeType.SECTION) {
|
||||
if (entry.getType() == NodeType.SECTION) {
|
||||
rectangle2D = RectangleTransformations.pad(rectangle2D, 10, 10);
|
||||
}
|
||||
drawRectangle2DList(document, page.getNumber(), List.of(rectangle2D), options);
|
||||
drawText(buildString(entry), document, new Point2D.Double(rectangle2D.getMinX(), rectangle2D.getMaxY() + 2), page.getNumber(), options, entry.type() == NodeType.TABLE_CELL);
|
||||
drawText(buildString(entry), document, new Point2D.Double(rectangle2D.getMinX(), rectangle2D.getMaxY() + 2), page.getNumber(), options, entry.getType() == NodeType.TABLE_CELL);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
private static String buildString(TableOfContents.Entry entry) {
|
||||
|
||||
return entry.node().getNumberOnPage() + ": " + entry.tocId() + ": " + entry.type().toString();
|
||||
return entry.getNode().getNumberOnPage() + ": " + entry.getTocId() + ": " + entry.getType();
|
||||
}
|
||||
|
||||
}
|
||||
@ -4,11 +4,13 @@
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>com.knecon.fforesight</groupId>
|
||||
<artifactId>layoutparser</artifactId>
|
||||
<version>1.0.0</version>
|
||||
<groupId>com.iqser.red</groupId>
|
||||
<artifactId>platform-dependency</artifactId>
|
||||
<version>1.17.0</version>
|
||||
<relativePath/>
|
||||
</parent>
|
||||
|
||||
<groupId>com.knecon.fforesight</groupId>
|
||||
<artifactId>layoutparser-service</artifactId>
|
||||
<version>1.0.0</version>
|
||||
|
||||
@ -25,10 +27,12 @@
|
||||
<jackson.version>2.13.2</jackson.version>
|
||||
<slf4j.version>2.0.7</slf4j.version>
|
||||
<pdfbox.version>3.0.0-alpha2</pdfbox.version>
|
||||
<lombok.version>1.18.26</lombok.version>
|
||||
<spring.version>3.0.1</spring.version>
|
||||
<spring.cloud.version>2022.0.1</spring.cloud.version>
|
||||
<jackson.version>2.15.0-rc2</jackson.version>
|
||||
<dsljson.version>1.9.9</dsljson.version>
|
||||
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
</properties>
|
||||
|
||||
|
||||
5
pom.xml
5
pom.xml
@ -3,11 +3,6 @@
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-parent</artifactId>
|
||||
<version>3.0.1</version>
|
||||
</parent>
|
||||
|
||||
<groupId>com.knecon.fforesight</groupId>
|
||||
<artifactId>layoutparser</artifactId>
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user