TAAS-41/ RED-6725: integrate layoutparser into redactmanager

This commit is contained in:
Kilian Schuettler 2023-07-24 15:48:28 +02:00
parent 9c8501e76a
commit 241a32cb4f
97 changed files with 2641 additions and 300 deletions

View File

@ -8,6 +8,7 @@ import lombok.Builder;
@Builder
public record LayoutParsingRequest(
LayoutParsingType layoutParsingType,
Map<String, String> identifier,
String originFileStorageId,
Optional<String> tablesFileStorageId,
@ -16,6 +17,7 @@ public record LayoutParsingRequest(
String researchDocumentStorageId,
String textBlockFileStorageId,
String positionBlockFileStorageId,
String pageFileStorageId) {
String pageFileStorageId,
String sectionGridStorageId) {
}

View File

@ -0,0 +1,7 @@
package com.knecon.fforesight.service.layoutparser.internal.api.queue;
/**
 * Names the kind of layout parsing a request asks for.
 *
 * <p>The declaration order of the constants is kept as-is so that ordinals stay stable.
 */
public enum LayoutParsingType {

    /** Layout parsing requested for RedactManager. */
    REDACT_MANAGER,

    /** Layout parsing requested for TAAS. */
    TAAS,

    /** Layout parsing requested for DocuMine. */
    DOCUMINE;

}

View File

@ -0,0 +1,16 @@
<Configuration>
<!-- Log4j 2 configuration with a single console appender. -->
<Appenders>
<!-- Writes to standard output; each line shows time, thread, level, logger name (precision 36) and message. -->
<Console name="CONSOLE" target="SYSTEM_OUT">
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
</Console>
</Appenders>
<Loggers>
<!-- Default threshold for every logger without a more specific entry below. -->
<Root level="warn">
<AppenderRef ref="CONSOLE"/>
</Root>
<!-- Loggers under com.iqser are lowered to info. -->
<!-- NOTE(review): no entry exists for com.knecon.fforesight, so those loggers stay at warn; confirm that this is intended. -->
<Logger name="com.iqser" level="info"/>
</Loggers>
</Configuration>

View File

@ -60,6 +60,12 @@
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-amqp</artifactId>
</dependency>
<dependency>
<!-- JUnit Jupiter, available on the test classpath only (scope "test"). -->
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<!-- NOTE(review): "RELEASE" is a floating meta-version, so the resolved JUnit version can change between builds. Presumably it should be pinned or inherited from dependency management; confirm against the parent POM. -->
<version>RELEASE</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@ -7,21 +7,24 @@ import java.io.IOException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.ClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService;
import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.mapper.taas.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.services.PdfParsingService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionGridCreatorService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -29,14 +32,17 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class LayoutParsingService {
public class LayoutParsingPipeline {
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
private final CvTableParsingAdapter cvTableParsingAdapter;
private final LayoutParsingStorageService layoutParsingStorageService;
private final PdfParsingService pdfParsingService;
private final ClassificationService classificationService;
private final SectionsBuilderService sectionsBuilderService;
private final SectionGridCreatorService sectionGridCreatorService;
private final TaasClassificationService taasClassificationService;
private final RedactManagerClassificationService redactManagerClassificationService;
private final DocuMineClassificationService docuMineClassificationService;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
@ -54,13 +60,17 @@ public class LayoutParsingService {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.pageFileStorageId());
}
Document documentGraph = parseLayout(originDocument, imageServiceResponse, tableServiceResponse);
Document documentGraph = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
int numberOfPages = originDocument.getNumberOfPages();
originDocument.close();
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
layoutParsingStorageService.storeSectionGrid(layoutParsingRequest, sectionGridCreatorService.createSectionGrid(documentGraph));
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, researchDocumentData, DocumentDataMapper.toDocumentData(documentGraph));
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) {
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
}
return LayoutParsingFinishedEvent.builder()
.identifier(layoutParsingRequest.identifier())
@ -75,13 +85,21 @@ public class LayoutParsingService {
}
public Document parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {
public Document parseLayout(LayoutParsingType layoutParsingType,
PDDocument originDocument,
ImageServiceResponse imageServiceResponse,
TableServiceResponse tableServiceResponse) {
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument,
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType,
originDocument,
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
classificationService.classifyDocument(classificationDocument);
switch (layoutParsingType) {
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
}
sectionsBuilderService.buildSections(classificationDocument);
@ -89,16 +107,25 @@ public class LayoutParsingService {
}
public Document parseLayoutWithTimer(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {
public Document parseLayoutWithTimer(LayoutParsingType layoutParsingType,
PDDocument originDocument,
ImageServiceResponse imageServiceResponse,
TableServiceResponse tableServiceResponse) {
long start = System.currentTimeMillis();
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument,
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType, originDocument,
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
System.out.printf("parsed %d ms", System.currentTimeMillis() - start);
start = System.currentTimeMillis();
classificationService.classifyDocument(classificationDocument);
switch (layoutParsingType) {
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
}
System.out.printf(", classified %d ms", System.currentTimeMillis() - start);
start = System.currentTimeMillis();

View File

@ -13,6 +13,7 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
import com.iqser.red.storage.commons.service.StorageService;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData;
@ -68,14 +69,24 @@ public class LayoutParsingStorageService {
}
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData, DocumentData documentData) {
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentTreeData());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getAtomicTextBlocks());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getAtomicPositionBlocks());
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getPages());
}
/**
 * Stores the given section grid as a JSON object for the current tenant.
 *
 * @param layoutParsingRequest request whose {@code sectionGridStorageId} is used as the storage id
 * @param sectionGrid the section grid to store
 */
public void storeSectionGrid(LayoutParsingRequest layoutParsingRequest, SectionGrid sectionGrid) {

    var tenantId = TenantContext.getTenantId();
    String storageId = layoutParsingRequest.sectionGridStorageId();
    storageService.storeJSONObject(tenantId, storageId, sectionGrid);
}
/**
 * Stores the given research document data as a JSON object for the current tenant.
 *
 * @param layoutParsingRequest request whose {@code researchDocumentStorageId} is used as the storage id
 * @param researchDocumentData the research document data to store
 */
public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) {

    var tenantId = TenantContext.getTenantId();
    String storageId = layoutParsingRequest.researchDocumentStorageId();
    storageService.storeJSONObject(tenantId, storageId, researchDocumentData);
}
@ -88,9 +99,7 @@ public class LayoutParsingStorageService {
AtomicPositionBlockData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
layoutParsingRequest.positionBlockFileStorageId(),
AtomicPositionBlockData[].class);
DocumentTreeData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(),
layoutParsingRequest.structureFileStorageId(),
DocumentTreeData.class);
DocumentTreeData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), DocumentTreeData.class);
return DocumentData.builder()
.documentTreeData(tableOfContentsData)

View File

@ -10,8 +10,8 @@ import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType;
import lombok.RequiredArgsConstructor;

View File

@ -1,8 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
/**
 * Orientation values: none, left or right.
 *
 * <p>The declaration order of the constants is kept as-is so that ordinals stay stable.
 */
public enum Orientation {

    NONE,
    LEFT,
    RIGHT;

}

View File

@ -13,13 +13,13 @@ import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Footer;
@ -81,8 +81,9 @@ public class DocumentGraphFactory {
page.getMainBody().add(node);
List<TextPageBlock> textBlocks = new ArrayList<>(textBlocksToMerge);
List<TextPageBlock> textBlocks = new ArrayList<>();
textBlocks.add(originalTextBlock);
textBlocks.addAll(textBlocksToMerge);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page);
List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
node.setLeafTextBlock(textBlock);

View File

@ -7,9 +7,9 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Objects;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
import lombok.experimental.UtilityClass;

View File

@ -10,10 +10,10 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Section;
@ -80,7 +80,7 @@ public class SectionNodeFactory {
remainingBlocks.removeAll(alreadyMerged);
if (abstractPageBlock instanceof TextPageBlock) {
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY(abstractPageBlock, remainingBlocks);
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
alreadyMerged.addAll(textBlocks);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
@ -123,7 +123,7 @@ public class SectionNodeFactory {
List<AbstractPageBlock> previousList = splitList.get(i - 1);
AbstractPageBlock lastPageBlockInPreviousList = previousList.get(previousList.size() - 1);
if (lastPageBlockInPreviousList.isHeadline()) {
previousList.remove(i - 1);
previousList.remove(previousList.size() - 1);
splitList.get(i).add(0, lastPageBlockInPreviousList);
}
}
@ -162,7 +162,7 @@ public class SectionNodeFactory {
}
private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsY(AbstractPageBlock atc, List<AbstractPageBlock> pageBlocks) {
private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsY(TextPageBlock atc, List<AbstractPageBlock> pageBlocks) {
return pageBlocks.stream()
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
@ -170,6 +170,7 @@ public class SectionNodeFactory {
.filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock)
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
.filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir())
.toList();
}

View File

@ -7,10 +7,10 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;

View File

@ -2,7 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.factory;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;

View File

@ -5,6 +5,7 @@ import static java.lang.String.format;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.IntStream;
import lombok.EqualsAndHashCode;
import lombok.Setter;
@ -107,6 +108,10 @@ public class Boundary implements Comparable<Boundary> {
return splitBoundaries;
}
/**
 * Streams the integer positions covered by this boundary.
 *
 * @return an ascending stream from {@code start} (inclusive) to {@code end} (exclusive);
 *         empty when {@code start >= end}
 */
public IntStream intStream() {

    return IntStream.range(this.start, this.end);
}
public static Boundary merge(Collection<Boundary> boundaries) {

View File

@ -5,7 +5,7 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector;

View File

@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.graph.entity.EntityT
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
public interface SemanticNode {

View File

@ -10,7 +10,7 @@ import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
public class PropertiesMapper {

View File

@ -1,8 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
package com.knecon.fforesight.service.layoutparser.processor.model;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
import lombok.Data;

View File

@ -1,11 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import lombok.Data;
import lombok.NoArgsConstructor;

View File

@ -1,8 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
import lombok.Data;

View File

@ -1,8 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
import lombok.Data;

View File

@ -1,11 +1,11 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.ArrayList;
import java.util.List;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import lombok.Data;
import lombok.NonNull;

View File

@ -1,10 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import lombok.Data;
import lombok.NoArgsConstructor;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.ArrayList;
import java.util.Collections;

View File

@ -0,0 +1,23 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.Rectangle2D;
import java.util.LinkedList;
import java.util.List;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.experimental.FieldDefaults;
/**
 * Container for two nested lists of rectangles: {@code xGaps} and {@code yGaps}.
 *
 * <p>Both fields are private and final via {@code @FieldDefaults}; getters and the
 * two-argument constructor are generated by Lombok.
 */
@Getter
@AllArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
public class Gaps {

    List<List<Rectangle2D>> xGaps;
    List<List<Rectangle2D>> yGaps;


    /**
     * Creates an instance whose two gap lists are new, empty, mutable lists.
     */
    public Gaps() {

        this.xGaps = new LinkedList<>();
        this.yGaps = new LinkedList<>();
    }

}

View File

@ -0,0 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
/**
 * Placeholder type: it currently declares no fields or methods.
 */
public class LineInformation {

}

View File

@ -0,0 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
/**
 * Orientation values: none, left or right.
 *
 * <p>The declaration order of the constants is kept as-is so that ordinals stay stable.
 */
public enum Orientation {

    NONE,
    LEFT,
    RIGHT;

}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
package com.knecon.fforesight.service.layoutparser.processor.model;
public enum PageBlockType {
H1,

View File

@ -0,0 +1,20 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
/**
 * Data holder for one page: its text position sequences and its crop box.
 *
 * <p>Getters, the builder and the all-arguments constructor are generated by Lombok.
 */
@Builder
@Getter
@AllArgsConstructor
public class PageInformation {

    // The name says "sorted"; this class itself does not sort or verify the order.
    List<TextPositionSequence> sortedTextPositionSequences;

    // Crop box rectangle of the page.
    Rectangle2D cropBox;

}

View File

@ -0,0 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
/**
 * Placeholder type: it currently declares no fields or methods.
 */
public class PageInformation {

}

View File

@ -0,0 +1,123 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Identifies a section by the numbering at the start of its headline and answers
 * parent/child questions between two identifiers.
 *
 * <p>Three formats exist: {@code EMPTY} (no usable numbering), {@code NUMERICAL} (up to four
 * integer components such as {@code 1.2.3.4}) and {@code DOCUMENT} (the root, parent of
 * everything). Instances are immutable.
 */
public class SectionIdentifier {

    /**
     * Optional leading whitespace, then one to four digit groups, each optionally followed by a
     * single whitespace, '.', ',' or ';'.
     */
    private static final Pattern NUMERICAL_IDENTIFIER_PATTERN = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");

    /** Number of capture groups in {@link #NUMERICAL_IDENTIFIER_PATTERN}. */
    private static final int MAX_LEVELS = 4;

    private enum Format {
        EMPTY,
        NUMERICAL,
        DOCUMENT
    }

    private final Format format;
    private final String identifierString;
    private final List<Integer> identifiers;
    private final boolean asChild;


    /**
     * Creates an identifier from its parts. Prefer the static factories.
     *
     * @param format           the identifier format
     * @param identifierString the text returned by {@link #toString()}
     * @param identifiers      the numeric components, outermost level first
     * @param asChild          whether an identifier with the same components counts as its parent
     */
    public SectionIdentifier(Format format, String identifierString, List<Integer> identifiers, boolean asChild) {

        this.format = format;
        this.identifierString = identifierString;
        this.identifiers = identifiers;
        this.asChild = asChild;
    }


    /**
     * Parses the section identifier from the beginning of a headline.
     *
     * @param headline the headline text, may be {@code null}
     * @return a numerical identifier if the headline starts with a number, otherwise {@link #empty()}
     */
    public static SectionIdentifier fromSearchText(String headline) {

        if (headline == null || headline.isBlank()) {
            return SectionIdentifier.empty();
        }

        Matcher numericalIdentifierMatcher = NUMERICAL_IDENTIFIER_PATTERN.matcher(headline);
        if (numericalIdentifierMatcher.find()) {
            return buildNumericalSectionIdentifier(headline, numericalIdentifierMatcher);
        }
        // more formats here
        return SectionIdentifier.empty();
    }


    /**
     * Creates a copy of the given identifier that is flagged as a child, so that the given
     * identifier (same components) is reported as its parent by {@link #isParentOf}.
     *
     * @param sectionIdentifier the identifier to derive from
     * @return the derived identifier
     */
    public static SectionIdentifier asChildOf(SectionIdentifier sectionIdentifier) {

        return new SectionIdentifier(sectionIdentifier.format, sectionIdentifier.toString(), sectionIdentifier.identifiers, true);
    }


    /**
     * @return the identifier of the document root, which is the parent of every identifier
     */
    public static SectionIdentifier document() {

        return new SectionIdentifier(Format.DOCUMENT, "document", Collections.emptyList(), false);
    }


    /**
     * @return the identifier used when no numbering could be recognised; it is never a parent or child
     */
    public static SectionIdentifier empty() {

        return new SectionIdentifier(Format.EMPTY, "empty", Collections.emptyList(), false);
    }


    /**
     * Builds a numerical identifier from a successful match. Components are read level by level
     * and reading stops at the first missing component or at a component equal to "0".
     */
    private static SectionIdentifier buildNumericalSectionIdentifier(String headline, Matcher numericalIdentifierMatcher) {

        String identifierString = headline.substring(numericalIdentifierMatcher.start(), numericalIdentifierMatcher.end());
        List<Integer> identifiers = new LinkedList<>();
        for (int level = 1; level <= MAX_LEVELS; level++) {
            String numericalIdentifier = numericalIdentifierMatcher.group(level);
            // A matched group consists of digits only, so null and "0" are the only stop conditions.
            if (numericalIdentifier == null || numericalIdentifier.equals("0")) {
                break;
            }
            try {
                identifiers.add(Integer.parseInt(numericalIdentifier));
            } catch (NumberFormatException ignored) {
                // A digit run too large for an int (e.g. a timestamp or id) is not a section
                // number; treat the headline as unnumbered instead of failing the whole parse.
                return SectionIdentifier.empty();
            }
        }
        return new SectionIdentifier(Format.NUMERICAL, identifierString, List.copyOf(identifiers), false);
    }


    /**
     * Determines if the current section is the parent of the given section.
     *
     * @param sectionIdentifier The section identifier to compare against.
     * @return true if the current section is the parent of the given section, false otherwise.
     */
    public boolean isParentOf(SectionIdentifier sectionIdentifier) {

        if (this.format == Format.EMPTY) {
            return false;
        }
        if (this.format == Format.DOCUMENT) {
            return true;
        }
        if (this.format != sectionIdentifier.format) {
            return false;
        }

        int ownDepth = this.identifiers.size();
        int otherDepth = sectionIdentifier.identifiers.size();
        // A parent must be shallower, except when the other identifier was created via asChildOf
        // and therefore carries the same components as its parent.
        boolean sameDepthChild = ownDepth == otherDepth && sectionIdentifier.asChild;
        if (ownDepth >= otherDepth && !sameDepthChild) {
            return false;
        }

        for (int i = 0; i < ownDepth; i++) {
            if (!this.identifiers.get(i).equals(sectionIdentifier.identifiers.get(i))) {
                return false;
            }
        }
        return true;
    }


    /**
     * Determines if the current section is a child of the given section.
     *
     * @param sectionIdentifier The section identifier to compare against.
     * @return true if the given section is the parent of this section; always false for the
     * document and empty identifiers.
     */
    public boolean isChildOf(SectionIdentifier sectionIdentifier) {

        if (this.format == Format.DOCUMENT || this.format == Format.EMPTY) {
            return false;
        }
        return sectionIdentifier.isParentOf(this);
    }


    @Override
    public String toString() {

        return identifierString;
    }

}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.image;
package com.knecon.fforesight.service.layoutparser.processor.model.image;
import java.awt.geom.Rectangle2D;

View File

@ -1,13 +1,13 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.Data;
import lombok.EqualsAndHashCode;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import lombok.RequiredArgsConstructor;
import lombok.Value;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.util.List;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
@ -11,8 +11,8 @@ import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.CohenSutherlandClipping;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import lombok.extern.slf4j.Slf4j;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Point2D;
import java.util.ArrayList;
@ -8,9 +8,9 @@ import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.Getter;
import lombok.Setter;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.beans.BeanUtils;

View File

@ -1,8 +1,8 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.ArrayList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.Getter;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import lombok.AllArgsConstructor;
import lombok.Builder;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.ArrayList;
import java.util.List;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.HashMap;
import java.util.Map;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import static java.util.stream.Collectors.toSet;
@ -7,11 +7,11 @@ import java.util.Comparator;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.AllArgsConstructor;
import lombok.Builder;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
package com.knecon.fforesight.service.layoutparser.processor.parsing;
import java.io.IOException;
import java.io.InputStream;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
package com.knecon.fforesight.service.layoutparser.processor.parsing;
import java.awt.color.CMMException;
import java.awt.geom.Point2D;
@ -35,9 +35,9 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.Getter;
import lombok.Setter;

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
package com.knecon.fforesight.service.layoutparser.processor.parsing;
import java.io.BufferedInputStream;
import java.io.IOException;

View File

@ -10,11 +10,10 @@ import org.springframework.amqp.rabbit.annotation.RabbitListener;
import org.springframework.amqp.rabbit.core.RabbitTemplate;
import org.springframework.stereotype.Service;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
@ -25,7 +24,7 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor
public class MessageHandler {
private final LayoutParsingService layoutParsingService;
private final LayoutParsingPipeline layoutParsingPipeline;
private final ObjectMapper objectMapper;
private final RabbitTemplate rabbitTemplate;
@ -42,7 +41,7 @@ public class MessageHandler {
throw new AmqpRejectAndDontRequeueException(String.format("Error during last layout parsing of request with identifier: %s, do not retry.",
layoutParsingRequest.identifier()));
}
LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingService.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
sendLayoutParsingFinishedEvent(layoutParsingFinishedEvent);
log.info("Layout parsing finished {} in {} ms", layoutParsingRequest.identifier(), layoutParsingFinishedEvent.duration());
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.util.List;
@ -6,13 +6,13 @@ import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
@Service
public class BodyTextFrameService {

View File

@ -0,0 +1,149 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.IntStream;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
/**
 * Detects a layout of equally wide columns by checking, for 2 to {@link #MAX_NUMBER_OF_COLUMNS} columns, whether
 * the text lines leave the x positions of the would-be column dividers free of text.
 */
@UtilityClass
public class DividingColumnDetectionService {

    // Share of all lines that the longest run of matching lines must exceed before a split is accepted.
    private static final double SPLITTABLE_LINE_PERCENTAGE_THRESHOLD = 0.6;
    // Largest number of columns that is tried.
    private static final int MAX_NUMBER_OF_COLUMNS = 4;


    /**
     * Splits the main body text frame into equally wide columns if enough consecutive lines support the split.
     *
     * @param textPositionSequences the text of the page; grouped into lines by {@code LineDetectionService}
     * @param mainBodyTextFrame     the frame that is divided into columns
     * @return the detected columns, or a list containing only {@code mainBodyTextFrame} if fewer than two text
     *         position sequences are given or no column count passes the threshold
     */
    public List<Rectangle2D> detectColumns(List<TextPositionSequence> textPositionSequences, Rectangle2D mainBodyTextFrame) {

        if (textPositionSequences.size() < 2) {
            return List.of(mainBodyTextFrame);
        }

        List<List<Rectangle2D>> linesWithGaps = LineDetectionService.findTextBlockInLines(textPositionSequences);

        // For every candidate column count remember the indices of the longest run of lines that fit it.
        Map<Integer, List<Integer>> linesWithMatchingGapIndices = new HashMap<>();
        for (int numberOfColumns = 2; numberOfColumns <= MAX_NUMBER_OF_COLUMNS; numberOfColumns++) {
            linesWithMatchingGapIndices.put(numberOfColumns, findConsecutiveLinesWithMatchingGaps(linesWithGaps, mainBodyTextFrame.getWidth(), numberOfColumns));
        }

        int optimalNumberOfColumns = findOptimalNumberOfColumns(linesWithMatchingGapIndices, linesWithGaps.size());
        if (optimalNumberOfColumns == 1) {
            return List.of(mainBodyTextFrame);
        }
        return buildColumns(mainBodyTextFrame, getLinesWithMatchingGaps(linesWithMatchingGapIndices.get(optimalNumberOfColumns), linesWithGaps), optimalNumberOfColumns);
    }


    /**
     * Returns the indices of the longest run of consecutive lines whose blocks leave all dividers free.
     */
    private static List<Integer> findConsecutiveLinesWithMatchingGaps(List<List<Rectangle2D>> linesWithGaps, double width, int numberOfColumns) {

        List<Boolean> booleans = lineHasMatchingGap(linesWithGaps, width, numberOfColumns);
        return findConsecutiveTrueIndicesWithMaxLengthRun(booleans);
    }


    /**
     * Maps every line to {@code true} if none of its blocks crosses any of the {@code numberOfColumns - 1} dividers.
     */
    private List<Boolean> lineHasMatchingGap(List<List<Rectangle2D>> linesWithGaps, double width, int numberOfColumns) {

        return linesWithGaps.stream()
                .map(blocksWithGaps -> IntStream.range(1, numberOfColumns)
                        .allMatch(columnIndex -> noBlocksIntersectX(blocksWithGaps, calculateGapLocation(width, numberOfColumns, columnIndex))))
                .toList();
    }


    /**
     * Returns the indices of the longest run of consecutive {@code true} values; the earliest run wins on ties.
     */
    private List<Integer> findConsecutiveTrueIndicesWithMaxLengthRun(List<Boolean> booleans) {

        List<Integer> maxConsecutiveTrueIndices = new LinkedList<>();
        List<Integer> currentConsecutiveTrueIndices = new LinkedList<>();
        for (int i = 0; i < booleans.size(); i++) {
            if (!booleans.get(i)) {
                if (currentConsecutiveTrueIndices.isEmpty()) {
                    continue;
                }
                if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) {
                    maxConsecutiveTrueIndices = currentConsecutiveTrueIndices;
                }
                currentConsecutiveTrueIndices = new LinkedList<>();
                continue;
            }
            currentConsecutiveTrueIndices.add(i);
        }
        // The last run is not terminated by a false value, so it has to be compared separately.
        if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) {
            return currentConsecutiveTrueIndices;
        }
        return maxConsecutiveTrueIndices;
    }


    /**
     * Picks the column count with the highest share of matching lines, or 1 if that share does not exceed
     * {@link #SPLITTABLE_LINE_PERCENTAGE_THRESHOLD}.
     */
    private static int findOptimalNumberOfColumns(Map<Integer, List<Integer>> linesWithMatchingGapIndices, Integer numberOfLines) {

        // NOTE(review): with equal shares the winner depends on the iteration order of the map - confirm this is acceptable.
        return linesWithMatchingGapIndices.entrySet()
                .stream()
                .max(comparePercentages(numberOfLines))
                .filter(entry -> percentageIsAboveThreshold(entry, numberOfLines))
                .map(Map.Entry::getKey)
                .orElse(1);
    }


    /**
     * Builds {@code optimalColumnCount} equally wide rectangles that start at the left edge of the main body text
     * frame and span the vertical range covered by the matching lines.
     *
     * @param rectanglesToMerge blocks of the matching lines; the first element supplies the max y and the last
     *                          element the min y of the columns
     */
    private List<Rectangle2D> buildColumns(Rectangle2D mainBodyTextFrame, List<Rectangle2D> rectanglesToMerge, int optimalColumnCount) {

        if (optimalColumnCount == 1 || rectanglesToMerge.isEmpty()) {
            return List.of(mainBodyTextFrame);
        }
        double maxY = rectanglesToMerge.get(0).getMaxY();
        double minY = rectanglesToMerge.get(rectanglesToMerge.size() - 1).getMinY();
        List<Rectangle2D> columns = new LinkedList<>();
        double width = mainBodyTextFrame.getWidth() / optimalColumnCount;
        double height = maxY - minY;
        for (int i = 0; i < optimalColumnCount; i++) {
            // The x origin of a column is an offset from the left edge (min x) of the frame, not from its min y.
            columns.add(new Rectangle2D.Double(mainBodyTextFrame.getMinX() + i * width, minY, width, height));
        }
        return columns;
    }


    /**
     * Orders map entries by the share of lines contained in their value.
     */
    private Comparator<Map.Entry<Integer, List<Integer>>> comparePercentages(Integer numberOfLines) {

        return Comparator.comparingDouble(entry -> calculatePercentage(entry.getValue().size(), numberOfLines));
    }


    /**
     * Collects the blocks of all lines referenced by the given indices into one flat list.
     */
    private List<Rectangle2D> getLinesWithMatchingGaps(List<Integer> linesWithMatchingGapIndices, List<List<Rectangle2D>> linesWithGaps) {

        return linesWithMatchingGapIndices.stream().map(linesWithGaps::get).flatMap(Collection::stream).toList();
    }


    private boolean percentageIsAboveThreshold(Map.Entry<Integer, List<Integer>> entry, Integer numberOfLines) {

        return calculatePercentage(entry.getValue().size(), numberOfLines) > SPLITTABLE_LINE_PERCENTAGE_THRESHOLD;
    }


    private double calculatePercentage(Integer numberOfMatchingLines, Integer numberOfLines) {

        return ((double) numberOfMatchingLines) / ((double) numberOfLines);
    }


    /**
     * Returns the x position of the divider that follows column {@code columnIndex}.
     */
    private double calculateGapLocation(double pageWidth, int numberOfColumns, int columnIndex) {

        // NOTE(review): the position is relative to x = 0, whereas buildColumns offsets columns by the frame's
        // min x - confirm that the block coordinates are relative to the frame or add the offset here as well.
        return (pageWidth / numberOfColumns) * columnIndex;
    }


    /**
     * Returns {@code true} if no block strictly contains the given x position.
     */
    private Boolean noBlocksIntersectX(List<Rectangle2D> blocksWithGaps, double x) {

        return blocksWithGaps.stream().noneMatch(rect -> rect.getMaxX() > x && rect.getMinX() < x);
    }

}

View File

@ -0,0 +1,169 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import java.util.LinkedList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.AllArgsConstructor;
import lombok.experimental.UtilityClass;
/**
 * Collects the empty rectangles ("gaps") between text positions and the edges of the main body text frame:
 * horizontal gaps are grouped per text line, vertical gaps are collected in a single list.
 */
@UtilityClass
public class GapDetectionService {

    private static final double X_GAP_FACTOR = 0.3; // multiplied with average text height, determines the minimum distance of gaps in lines
    private static final double Y_GAP_FACTOR = 1; // multiplied with average text height, larger vertical distances are recorded as y gaps
    private static final double NEW_LINE_FACTOR = 0.2; // multiplied with average text height, larger vertical distances start a new line


    /**
     * Walks over the text positions in the given order and records the gaps between them.
     *
     * @param sortedTextPositionSequences the text positions; the method does not sort them itself and compares
     *                                    each element only with its predecessor
     * @param mainBodyTextFrame           the frame whose edges bound the first and last gap of every line
     * @return the collected x gaps per line and y gaps, or an empty {@code Gaps} for empty input
     */
    public static Gaps findGapsInLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {

        if (sortedTextPositionSequences.isEmpty()) {
            return new Gaps();
        }

        //assertAllTextPositionsHaveSameDir(textPositionSequences);

        final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences);

        XGapsContext xGapContext = XGapsContext.init(mainBodyTextFrame);
        YGapsContext yGapContext = YGapsContext.init(mainBodyTextFrame);

        var previousTextPosition = sortedTextPositionSequences.get(0);
        Rectangle2D rectangle = toRectangle2D(previousTextPosition);
        yGapContext.addGapFromTopOfMainBody(rectangle);
        xGapContext.addGapFromLeftEdgeOfMainBody(rectangle);

        for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) {

            double yGap = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj());
            double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj());
            Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition);
            Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition);

            if (yGap > avgTextPositionHeight * Y_GAP_FACTOR) {
                yGapContext.addGap(mainBodyTextFrame.getMinX(), currentTextPositionBBox.getMaxY(), mainBodyTextFrame.getWidth(), yGap);
            }
            if (yGap > avgTextPositionHeight * NEW_LINE_FACTOR) {
                // Line break: close the previous line at the right edge and open a new one at the left edge.
                xGapContext.addGapToRightEdgeOfMainBody(previousTextPositionBBox);
                xGapContext.gapsInCurrentLine = new LinkedList<>();
                xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
                xGapContext.addGapFromLeftEdgeOfMainBody(currentTextPositionBBox);
            } else if (xGap > avgTextPositionHeight * X_GAP_FACTOR) {
                // X_GAP_FACTOR is a minimum distance: only distances above it count as a gap within the line.
                addGapToLine(currentTextPositionBBox, previousTextPositionBBox, xGapContext);
            }
            previousTextPosition = currentTextPosition;
        }
        xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedTextPositionSequences.get(sortedTextPositionSequences.size() - 1)));
        xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);

        return new Gaps(xGapContext.gapsPerLine, yGapContext.gapsPerLine);
    }


    /**
     * Converts the rectangle of a text position via {@code RectangleTransformations.toRectangle2D}.
     */
    private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) {

        return RectangleTransformations.toRectangle2D(textPosition.getRectangle());
    }


    /**
     * Adds the gap between two neighbouring text positions to the current line. The gap starts at the right edge
     * of the previous position and uses the mean height of both positions.
     */
    private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {

        context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(),
                previousTextPosition.getMinY(),
                currentTextPosition.getMinX() - previousTextPosition.getMaxX(),
                (previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2));
    }


    /**
     * Asserts that all text positions share the direction of the first one (only active with assertions enabled).
     */
    private static void assertAllTextPositionsHaveSameDir(List<TextPositionSequence> textPositionSequences) {

        assert textPositionSequences.stream().map(TextPositionSequence::getDir).allMatch(a -> a.equals(textPositionSequences.get(0).getDir()));
    }


    /**
     * Returns the mean height of the given text positions.
     *
     * @throws java.util.NoSuchElementException if the list is empty
     */
    private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {

        return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
    }


    /**
     * Mutable state for collecting vertical gaps; all gaps end up in the single list created by {@link #init}.
     */
    @AllArgsConstructor
    private static class YGapsContext {

        List<List<Rectangle2D>> gapsPerLine;
        List<Rectangle2D> gapsInCurrentLine;
        Rectangle2D mainBodyTextFrame;


        public static YGapsContext init(Rectangle2D mainBodyTextFrame) {

            List<List<Rectangle2D>> initialLinesWithGaps = new LinkedList<>();
            List<Rectangle2D> initialBlocksInLine = new LinkedList<>();
            initialLinesWithGaps.add(initialBlocksInLine);
            return new YGapsContext(initialLinesWithGaps, initialBlocksInLine, mainBodyTextFrame);
        }


        /**
         * Adds the gap between the max y of the given rectangle and the max y of the main body text frame.
         */
        public void addGapFromTopOfMainBody(Rectangle2D rectangle) {

            gapsInCurrentLine.add(new Rectangle2D.Double(mainBodyTextFrame.getMinX(),
                    rectangle.getMaxY(),
                    mainBodyTextFrame.getWidth(),
                    mainBodyTextFrame.getMaxY() - rectangle.getMaxY()));
        }


        public void addGap(double x, double y, double w, double h) {

            gapsInCurrentLine.add(new Rectangle2D.Double(x, y, w, h));
        }

    }

    /**
     * Mutable state for collecting horizontal gaps, grouped by text line.
     */
    @AllArgsConstructor
    private static class XGapsContext {

        List<List<Rectangle2D>> gapsPerLine;
        List<Rectangle2D> gapsInCurrentLine;
        Rectangle2D mainBodyTextFrame;


        public static XGapsContext init(Rectangle2D mainBodyTextFrame) {

            List<List<Rectangle2D>> initialLinesWithGaps = new LinkedList<>();
            List<Rectangle2D> initialBlocksInLine = new LinkedList<>();
            initialLinesWithGaps.add(initialBlocksInLine);
            return new XGapsContext(initialLinesWithGaps, initialBlocksInLine, mainBodyTextFrame);
        }


        /**
         * Adds the gap between the right edge of the given text position and the right edge of the frame.
         */
        public void addGapToRightEdgeOfMainBody(Rectangle2D textPosition) {

            Rectangle2D rightGap = new Rectangle2D.Double(textPosition.getMaxX(),
                    textPosition.getMinY(),
                    mainBodyTextFrame.getMaxX() - textPosition.getMaxX(),
                    textPosition.getHeight());
            gapsInCurrentLine.add(rightGap);
        }


        /**
         * Adds the gap between the left edge of the frame and the left edge of the given text position.
         */
        public void addGapFromLeftEdgeOfMainBody(Rectangle2D textPosition) {

            Rectangle2D leftGap = new Rectangle2D.Double(mainBodyTextFrame.getMinX(),
                    textPosition.getMinY(),
                    textPosition.getMinX() - mainBodyTextFrame.getMinX(),
                    textPosition.getHeight());
            gapsInCurrentLine.add(leftGap);
        }

    }

}

View File

@ -0,0 +1,199 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
import lombok.experimental.UtilityClass;
@UtilityClass
public class GapFindingColumnDetectionService implements ColumnDetectionService {
private static final double GAP_WIDTH_THRESHOLD_FACTOR = 0.01; // multiplied with avg text height
private static final double LINE_COUNT_THRESHOLD_FACTOR = 0.3; // multiplied with average line count per page
public List<Rectangle2D> detectColumns(GapInformation gapInformation, Rectangle2D mainBodyTextFrame) {
if (gapInformation.getXGaps().size() < 2) {
return List.of(mainBodyTextFrame);
}
double avgHeight = gapInformation.getXGaps()
.stream()
.filter(gaps -> !gaps.isEmpty())
.map(gaps -> gaps.get(0))
.mapToDouble(RectangularShape::getHeight)
.average()
.orElseThrow();
ColumnFactory columnFactory = ColumnFactory.init(avgHeight, gapInformation.getXGaps().size());
gapInformation.getXGaps().get(0).stream().map(Column::new).forEach(columnFactory::addToQueue);
List<List<Rectangle2D>> xGaps = gapInformation.getXGaps();
for (var gaps : xGaps.subList(1, xGaps.size())) {
while (columnFactory.hasColumnsToProcess()) {
Column column = columnFactory.getNext();
rememberColumnIfValid(columnFactory, column);
elongateColumnsAndFilterForWidth(column, gaps, columnFactory).forEach(columnFactory::setToStillInProgress);
}
columnFactory.addStillInProgressToQueue();
columnFactory.addGapsToQueue(gaps);
}
return columnFactory.outputColumns.stream()
.filter(column -> columnFactory.outputColumns.stream().filter(column::intersectsX).noneMatch(column1 -> column1.lineCount > column.lineCount))
.filter(column -> )
.map(Column::getRectangle2D)
.toList();
}
private static void rememberColumnIfValid(ColumnFactory columnFactory, Column column) {
if (column.lineCount >= LINE_COUNT_THRESHOLD_FACTOR * (double) columnFactory.lineCount) {
columnFactory.outputColumns.add(column);
}
}
private static Stream<Column> elongateColumnsAndFilterForWidth(Column column, List<Rectangle2D> gaps, ColumnFactory columnFactory) {
return gaps.stream()//
.filter(gap -> column.getIntersectionWidth(gap) > GAP_WIDTH_THRESHOLD_FACTOR * columnFactory.avgHeight)//
.map(column::addNewLineAndShrink);
}
private static Rectangle2D correctRectangle(Rectangle2D rectangle2D) {
double minX = Math.min(rectangle2D.getMinX(), rectangle2D.getMaxX());
double minY = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY());
double maxX = Math.max(rectangle2D.getMinX(), rectangle2D.getMaxX());
double maxY = Math.max(rectangle2D.getMinY(), rectangle2D.getMaxY());
return new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY);
}
@Getter
@AllArgsConstructor
private class Column {
Rectangle2D rectangle2D;
int lineCount = 1;
public Column(Rectangle2D rectangle2D) {
this.rectangle2D = correctRectangle(rectangle2D);
}
public boolean intersectsX(Rectangle2D rectangle2D) {
return rectangle2D.getMinX() < this.rectangle2D.getMaxX() && this.rectangle2D.getMinX() < rectangle2D.getMaxX();
}
public boolean intersectsX(Column column) {
return this.intersectsX(column.getRectangle2D());
}
public double getIntersectionWidth(Rectangle2D rectangle2D) {
if (!intersectsX(rectangle2D)) {
return -1;
}
double min_x = Math.max(rectangle2D.getMinX(), this.rectangle2D.getMinX());
double max_x = Math.min(rectangle2D.getMaxX(), this.rectangle2D.getMaxX());
return max_x - min_x;
}
public Column addNewLineAndShrink(Rectangle2D rectangle2D) {
var correctedRectangle = correctRectangle(rectangle2D);
double min_x = Math.max(correctedRectangle.getMinX(), this.rectangle2D.getMinX());
double max_x = Math.min(correctedRectangle.getMaxX(), this.rectangle2D.getMaxX());
double min_y = correctedRectangle.getMinY();
double max_y = this.rectangle2D.getMaxY();
double width = max_x - min_x;
double height = max_y - min_y;
return new Column(new Rectangle2D.Double(min_x, min_y, width, height), lineCount + 1);
}
}
@RequiredArgsConstructor
private class ColumnFactory {
final double avgHeight;
final int lineCount;
List<Column> outputColumns = new LinkedList<>();
Queue<Column> columnQueue = new LinkedList<>();
List<Column> columnsToQueue = new LinkedList<>();
public static ColumnFactory init(double avgHeight, int lineCount) {
return new ColumnFactory(Math.abs(avgHeight), lineCount);
}
public Column getNext() {
return columnQueue.remove();
}
public void addToQueue(Column column) {
columnQueue.add(column);
}
public void addToQueue(Rectangle2D gap) {
columnQueue.add(new Column(gap));
}
private boolean hasColumnsToProcess() {
return columnQueue.peek() != null;
}
public void setToStillInProgress(Column column) {
columnsToQueue.add(column);
}
private void addStillInProgressToQueue() {
for (int i = columnsToQueue.size() - 1; i >= 0; i--) {
columnQueue.add(columnsToQueue.remove(i));
}
}
public void addGapsToQueue(List<Rectangle2D> gaps) {
gaps.forEach(this::addToQueue);
}
}
}

View File

@ -0,0 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
/**
 * Placeholder: this class declares no members yet.
 */
public class InvisibleTableDetectionService {
}

View File

@ -0,0 +1,122 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
import lombok.AllArgsConstructor;
import lombok.experimental.UtilityClass;
/**
 * Groups text positions into lines and, within each line, into blocks that are separated by wide horizontal gaps.
 */
@UtilityClass
public class LineDetectionService {

    private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines


    /**
     * Sorts the text positions with a {@code TextPositionSequenceComparator} and merges neighbouring positions
     * into blocks. A new line is started when the vertical offset exceeds the average text height or the text
     * direction changes; a new block is started when the horizontal distance exceeds the gap threshold.
     *
     * @param textPositionSequences the text positions to group
     * @return one list of block bounding boxes per line, or an empty list for empty input
     */
    public static List<List<Rectangle2D>> findTextBlockInLines(List<TextPositionSequence> textPositionSequences) {

        if (textPositionSequences.isEmpty()) {
            return Collections.emptyList();
        }

        final double meanHeight = getAvgTextPositionHeight(textPositionSequences);
        TextBlockContext state = TextBlockContext.init();
        List<TextPositionSequence> ordered = textPositionSequences.stream().sorted(new TextPositionSequenceComparator()).toList();

        TextPositionSequence previous = ordered.get(0);
        state.textPositionsToMerge.add(previous);

        for (int index = 1; index < ordered.size(); index++) {
            TextPositionSequence current = ordered.get(index);
            boolean lineBreak = isNewLine(current, previous, meanHeight) || isSplitByOrientation(current, previous);

            if (lineBreak) {
                addBlockToLine(state);
                startNewLine(current, state);
            } else if (isXGap(current, previous, meanHeight)) {
                addBlockToLine(state);
                startNewBlock(current, state);
            } else {
                state.textPositionsToMerge.add(current);
            }
            previous = current;
        }

        // Flush the block that is still being merged when the input ends.
        addBlockToLine(state);
        return state.textBlocksInLines;
    }


    /**
     * Returns the mean height of all text positions; throws if the list is empty.
     */
    private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {

        return textPositionSequences.stream()
                .mapToDouble(TextPositionSequence::getHeight)
                .average()
                .orElseThrow();
    }


    /**
     * Tells whether the horizontal distance between both positions exceeds the gap threshold.
     */
    private static boolean isXGap(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) {

        double horizontalDistance = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj());
        return horizontalDistance > (avgTextPositionHeight * X_GAP_FACTOR);
    }


    /**
     * Tells whether both positions have different text directions.
     */
    private static boolean isSplitByOrientation(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition) {

        boolean sameDirection = previousTextPosition.getDir().equals(currentTextPosition.getDir());
        return !sameDirection;
    }


    /**
     * Tells whether the vertical offset between both positions exceeds the average text height.
     */
    private static boolean isNewLine(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) {

        double verticalOffset = Math.abs(previousTextPosition.getMinYDirAdj() - currentTextPosition.getMinYDirAdj());
        return verticalOffset > avgTextPositionHeight;
    }


    /**
     * Begins a fresh merge list that contains only the given position.
     */
    private static void startNewBlock(TextPositionSequence currentTextPosition, TextBlockContext context) {

        List<TextPositionSequence> freshBlock = new LinkedList<>();
        freshBlock.add(currentTextPosition);
        context.textPositionsToMerge = freshBlock;
    }


    /**
     * Appends the bounding box of the positions merged so far to the current line.
     */
    private static void addBlockToLine(TextBlockContext context) {

        Rectangle2D blockBox = textPositionBBox(context.textPositionsToMerge);
        context.blocksInCurrentLine.add(blockBox);
    }


    /**
     * Opens a new, empty line, registers it in the result and starts its first block with the given position.
     */
    private static void startNewLine(TextPositionSequence current, TextBlockContext context) {

        List<Rectangle2D> freshLine = new LinkedList<>();
        context.blocksInCurrentLine = freshLine;
        context.textBlocksInLines.add(freshLine);
        startNewBlock(current, context);
    }


    /**
     * Computes the bounding box of the rectangles of the given positions via {@code RectangleTransformations}.
     */
    private Rectangle2D textPositionBBox(List<TextPositionSequence> textPositionSequences) {

        var rectangles = textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList();
        return RectangleTransformations.rectangleBBox(rectangles);
    }


    /**
     * Mutable state of the grouping: all lines found so far, the line being filled and the positions of the
     * block being merged.
     */
    @AllArgsConstructor
    private class TextBlockContext {

        List<List<Rectangle2D>> textBlocksInLines;
        List<Rectangle2D> blocksInCurrentLine;
        List<TextPositionSequence> textPositionsToMerge;


        /**
         * Creates a context whose result already contains the first, still empty line.
         */
        public static TextBlockContext init() {

            List<Rectangle2D> firstLine = new LinkedList<>();
            List<List<Rectangle2D>> lines = new LinkedList<>();
            lines.add(firstLine);
            return new TextBlockContext(lines, firstLine, new LinkedList<>());
        }

    }

}

View File

@ -0,0 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
/**
 * Placeholder: this class declares no members yet.
 */
public class MainBodyTextFrameExtractionService {
}

View File

@ -0,0 +1,2 @@
package com.knecon.fforesight.service.layoutparser.processor.services;public class PageInformationService {
    // Placeholder: this class declares no members yet.
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.util.ArrayList;
import java.util.List;
@ -9,16 +9,20 @@ import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
@ -31,11 +35,16 @@ public class PdfParsingService {
private final RulingCleaningService rulingCleaningService;
private final TableExtractionService tableExtractionService;
private final BlockificationService blockificationService;
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
private final TaasBlockificationService taasBlockificationService;
private final DocuMineBlockificationService docuMineBlockificationService;
private final RedactManagerBlockificationService redactManagerBlockificationService;
public ClassificationDocument parseDocument(PDDocument originDocument, Map<Integer, List<TableCells>> pdfTableCells, Map<Integer, List<ClassifiedImage>> pdfImages) {
public ClassificationDocument parseDocument(LayoutParsingType layoutParsingType,
PDDocument originDocument,
Map<Integer, List<TableCells>> pdfTableCells,
Map<Integer, List<ClassifiedImage>> pdfImages) {
ClassificationDocument document = new ClassificationDocument();
List<ClassificationPage> classificationPages = new ArrayList<>();
@ -44,7 +53,7 @@ public class PdfParsingService {
long pageCount = originDocument.getNumberOfPages();
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
parsePage(pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber);
parsePage(layoutParsingType, pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber);
}
document.setPages(classificationPages);
@ -54,7 +63,8 @@ public class PdfParsingService {
@SneakyThrows
private void parsePage(Map<Integer, List<ClassifiedImage>> pdfImages,
private void parsePage(LayoutParsingType layoutParsingType,
Map<Integer, List<ClassifiedImage>> pdfImages,
PDDocument pdDocument,
Map<Integer, List<TableCells>> pdfTableCells,
ClassificationDocument document,
@ -79,7 +89,12 @@ public class PdfParsingService {
stripper.getRulings(),
stripper.getMinCharWidth(),
stripper.getMaxCharHeight());
ClassificationPage classificationPage = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
};
classificationPage.setRotation(rotation);
classificationPage.setLandscape(isLandscape);

View File

@ -1,95 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import static java.lang.String.format;
import java.awt.geom.Area;
import java.awt.geom.Rectangle2D;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
import lombok.experimental.UtilityClass;
/**
 * Static helpers for padding, merging and (de)serializing {@link Rectangle2D} instances.
 */
@UtilityClass
public class RectangleTransformations {

    /**
     * Grows a rectangle by {@code deltaX} on the left and right and by {@code deltaY} on the top and bottom.
     *
     * @return a new rectangle; the argument is not modified
     */
    public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) {

        return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY);
    }


    /**
     * Returns the bounds of the union of all positions of all given atomic text blocks.
     */
    public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {

        return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DUnion());
    }


    /**
     * Returns the bounds of the union of all given rectangles.
     */
    public static Rectangle2D rectangleUnion(List<Rectangle2D> rectangle2DList) {

        return rectangle2DList.stream().collect(new Rectangle2DUnion());
    }


    /**
     * Serializes a rectangle as {@code x,y,width,height}; the counterpart of {@link #parseRectangle2D(String)}.
     */
    public static String toString(Rectangle2D rectangle2D) {

        // Locale.ROOT guarantees '.' as decimal separator. With the default locale (e.g. de_DE) "%f" emits ',',
        // which collides with the field separator and makes the result unparseable by parseRectangle2D.
        return format(java.util.Locale.ROOT, "%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
    }


    /**
     * Parses a rectangle from the comma separated {@code x,y,width,height} form produced by
     * {@link #toString(Rectangle2D)}. Values are parsed with float precision.
     *
     * @throws NumberFormatException     if a field is not a valid float
     * @throws IndexOutOfBoundsException if fewer than four fields are present
     */
    public static Rectangle2D parseRectangle2D(String bBox) {

        List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
        return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
    }


    /**
     * Collector that adds every rectangle to an {@link Area} and finally returns the bounds of that area.
     */
    private static class Rectangle2DUnion implements Collector<Rectangle2D, Area, Rectangle2D> {

        @Override
        public Supplier<Area> supplier() {

            return Area::new;
        }


        @Override
        public BiConsumer<Area, Rectangle2D> accumulator() {

            return (area, rectangle2D) -> area.add(new Area(rectangle2D));
        }


        @Override
        public BinaryOperator<Area> combiner() {

            return (area1, area2) -> {
                area1.add(area2);
                return area1;
            };
        }


        @Override
        public Function<Area, Rectangle2D> finisher() {

            return Area::getBounds2D;
        }


        @Override
        public Set<Characteristics> characteristics() {

            // Not CONCURRENT: that flag would allow several threads to accumulate into the same Area,
            // and Area is not thread-safe. Parallel streams still work through supplier()/combiner().
            return Set.of(Characteristics.UNORDERED);
        }

    }

}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
@ -13,9 +13,9 @@ import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;

View File

@ -0,0 +1,146 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.BiConsumer;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collector;
import java.util.stream.Stream;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.CellRectangle;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionRectangle;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell;
import lombok.RequiredArgsConstructor;
/**
 * Builds the {@link SectionGrid} of a {@link Document}: per page number, the rectangles of all sections,
 * paragraphs, headlines and tables (tables additionally carry the rectangles of their cells).
 */
@Service
@RequiredArgsConstructor
public class SectionGridCreatorService {

    /**
     * Collects the bounding boxes of all SECTION, PARAGRAPH, HEADLINE and TABLE nodes of the document and merges
     * them into one map keyed by page number.
     *
     * @param document the document whose sub nodes are inspected
     * @return a new section grid holding the merged rectangles per page
     */
    public SectionGrid createSectionGrid(Document document) {

        Map<Integer, List<SectionRectangle>> sectionBBox = document.streamAllSubNodesOfType(NodeType.SECTION).map(SemanticNode::getBBox).collect(new SectionGridCollector());
        Map<Integer, List<SectionRectangle>> paragraphBBox = document.streamAllSubNodesOfType(NodeType.PARAGRAPH).map(SemanticNode::getBBox).collect(new SectionGridCollector());
        Map<Integer, List<SectionRectangle>> headlineBBox = document.streamAllSubNodesOfType(NodeType.HEADLINE).map(SemanticNode::getBBox).collect(new SectionGridCollector());
        Map<Integer, List<SectionRectangle>> tableBBox = document.streamAllSubNodesOfType(NodeType.TABLE).map(node -> (Table) node).collect(new TableGridCollector());
        var sectionGrid = new SectionGrid();
        // Resulting per-page order: paragraphs, headlines, sections, tables.
        sectionGrid.setRectanglesPerPage(mergeMapsByConcatenatingLists(//
                mergeMapsByConcatenatingLists(paragraphBBox, headlineBBox), //
                mergeMapsByConcatenatingLists(sectionBBox, tableBBox)));
        return sectionGrid;
    }


    /**
     * Shared collector skeleton: accumulates into a {@link HashMap} from page number to rectangles and returns
     * that map unchanged. Subclasses only provide the accumulator.
     */
    private static abstract class GridCollector<T> implements Collector<T, Map<Integer, List<SectionRectangle>>, Map<Integer, List<SectionRectangle>>> {

        @Override
        public Supplier<Map<Integer, List<SectionRectangle>>> supplier() {

            return HashMap::new;
        }


        @Override
        public Function<Map<Integer, List<SectionRectangle>>, Map<Integer, List<SectionRectangle>>> finisher() {

            return Function.identity();
        }


        @Override
        public BinaryOperator<Map<Integer, List<SectionRectangle>>> combiner() {

            return SectionGridCreatorService::mergeMapsByConcatenatingLists;
        }


        @Override
        public Set<Characteristics> characteristics() {

            // Not CONCURRENT: that flag would allow several threads to call the accumulator on the same
            // container, and the container is a plain HashMap mutated via merge(). Parallel streams are still
            // supported through supplier()/combiner().
            return Set.of(Characteristics.IDENTITY_FINISH, Characteristics.UNORDERED);
        }

    }

    /**
     * Collects tables: one rectangle per page a table appears on, including the cell rectangles on that page.
     */
    private static class TableGridCollector extends GridCollector<Table> {

        @Override
        public BiConsumer<Map<Integer, List<SectionRectangle>>, Table> accumulator() {

            return (map, table) -> table.getPages()
                    .forEach(page -> map.merge(page.getNumber(), List.of(toSectionRectangle(table, page, table.getPages().size())), SectionGridCreatorService::concatLists));
        }


        /**
         * Converts the part of {@code table} that lies on {@code page} into a section rectangle. Cells without
         * a bounding box on that page are left out.
         */
        private static SectionRectangle toSectionRectangle(Table table, Page page, int numberOfParts) {

            Rectangle2D rect = table.getBBox().get(page);
            List<CellRectangle> tableCellRectangles = table.streamTableCells()
                    .map(TableCell::getBBox)
                    .map(map -> map.get(page))
                    .filter(Objects::nonNull)
                    .map(rectangle2D -> new CellRectangle(new Point((float) rectangle2D.getX(), (float) rectangle2D.getY()),
                            (float) rectangle2D.getWidth(),
                            (float) rectangle2D.getHeight()))
                    .toList();
            // NOTE(review): the fourth argument is always 1 - presumably the index of the part; confirm against SectionRectangle.
            return new SectionRectangle(new Point((float) rect.getX(), (float) rect.getY()),
                    (float) rect.getWidth(),
                    (float) rect.getHeight(),
                    1,
                    numberOfParts,
                    tableCellRectangles);
        }

    }

    /**
     * Collects per-page bounding boxes of a node: one rectangle per page, without table cells.
     */
    private static class SectionGridCollector extends GridCollector<Map<Page, Rectangle2D>> {

        @Override
        public BiConsumer<Map<Integer, List<SectionRectangle>>, Map<Page, Rectangle2D>> accumulator() {

            return (mapToKeep, mapToMerge) -> mapToMerge.forEach((page, rectangle) -> mapToKeep.merge(page.getNumber(),
                    List.of(toSectionRectangle(rectangle, mapToMerge.values().size())),
                    SectionGridCreatorService::concatLists));
        }


        private static SectionRectangle toSectionRectangle(Rectangle2D rect, int numberOfParts) {

            return new SectionRectangle(new Point((float) rect.getX(), (float) rect.getY()), (float) rect.getWidth(), (float) rect.getHeight(), 1, numberOfParts, null);
        }

    }


    /**
     * Merges {@code mapToMerge} into {@code mapToKeep}; lists of pages present in both maps are concatenated.
     *
     * @return {@code mapToKeep}, which is modified in place
     */
    private static Map<Integer, List<SectionRectangle>> mergeMapsByConcatenatingLists(Map<Integer, List<SectionRectangle>> mapToKeep,
                                                                                      Map<Integer, List<SectionRectangle>> mapToMerge) {

        mapToMerge.forEach((page, rectangle) -> mapToKeep.merge(page, rectangle, SectionGridCreatorService::concatLists));
        return mapToKeep;
    }


    /**
     * Returns a new unmodifiable list containing the elements of {@code l1} followed by those of {@code l2}.
     */
    private static List<SectionRectangle> concatLists(List<SectionRectangle> l1, List<SectionRectangle> l2) {

        return Stream.concat(l1.stream(), l2.stream()).toList();
    }

}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.util.ArrayList;
import java.util.Collections;
@ -9,18 +9,18 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationSection;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import lombok.extern.slf4j.Slf4j;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Point2D;
import java.util.ArrayList;
@ -12,15 +12,15 @@ import java.util.Set;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
@Service
public class TableExtractionService {

View File

@ -0,0 +1,75 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.processor.model.TextPositionsWithPage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
import lombok.experimental.UtilityClass;
/**
 * Extracts the text position sequences of a PDF from the classpath, page by page.
 */
@UtilityClass
public class TextPositionSequenceExtractionService {

    /**
     * Loads the PDF {@code filename} from the classpath and returns, for every page, its text position sequences
     * together with the page's crop box. Sequences are sorted with {@link TextPositionSequenceComparator}, grouped
     * by text direction and the groups ordered by {@link #sortByDirAccordingToPageRotation(Map, int)}.
     *
     * @param filename classpath location of the PDF
     * @return one entry per page, in page order
     * @throws IOException if the resource cannot be read or parsed
     */
    public List<TextPositionsWithPage> getSortedTextPositionsWithPages(String filename) throws IOException {

        List<TextPositionsWithPage> textPositionSequencesPerPage = new LinkedList<>();
        // Both resources are managed by try-with-resources so the document is closed even if a page fails to parse.
        try (InputStream inputStream = new ClassPathResource(filename).getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) {
            for (int pageNumber = 1; pageNumber <= pdDocument.getNumberOfPages(); pageNumber++) {
                PDFLinesTextStripper stripper = new PDFLinesTextStripper();
                PDPage pdPage = pdDocument.getPage(pageNumber - 1);
                stripper.setPageNumber(pageNumber);
                stripper.setStartPage(pageNumber);
                stripper.setEndPage(pageNumber);
                stripper.setPdpage(pdPage);
                stripper.getText(pdDocument);

                Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences()
                        .stream()
                        .sorted(new TextPositionSequenceComparator())
                        .collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees()));

                var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation());

                textPositionSequencesPerPage.add(new TextPositionsWithPage(sortedTextPositionSequences, RectangleTransformations.toRectangle2D(pdPage.getCropBox())));
            }
        }

        return textPositionSequencesPerPage;
    }


    /**
     * Flattens the per-direction groups into one list. Directions are visited in ascending order, starting with
     * the first direction that is not smaller than {@code rotation}; directions smaller than {@code rotation}
     * follow afterwards, also in ascending order.
     *
     * @param sortedTextPositionSequencesPerDir sequences grouped by text direction in degrees
     * @param rotation                          rotation of the page in degrees
     * @return all sequences of all groups in the order described above
     */
    public List<TextPositionSequence> sortByDirAccordingToPageRotation(Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir, int rotation) {

        // Partition instead of removing/appending while iterating by index: the in-place variant skipped the
        // element following each removal and re-visited keys that had already been moved to the end.
        List<Float> keysFromRotation = new LinkedList<>();
        List<Float> keysBeforeRotation = new LinkedList<>();
        for (Float key : sortedTextPositionSequencesPerDir.keySet().stream().sorted().toList()) {
            if (key < rotation) {
                keysBeforeRotation.add(key);
            } else {
                keysFromRotation.add(key);
            }
        }
        keysFromRotation.addAll(keysBeforeRotation);

        return keysFromRotation.stream().map(sortedTextPositionSequencesPerDir::get).flatMap(Collection::stream).toList();
    }

}

View File

@ -0,0 +1,229 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import static java.util.stream.Collectors.toSet;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
@Service
public class DocuMineBlockificationService {
static final float THRESHOLD = 1f;
/**
 * Groups the words of a page into text blocks: the bounding box (minX/maxX, minY/maxY) of the current block is
 * expanded word by word, and the block is closed as soon as one of the split conditions below holds.
 * This method must use text direction adjusted positions (DirAdj), where {0,0} is on the upper left. Never try to change this!
 * Rulings (table lines) must be adjusted to the text direction as well when checking whether a block is split by a ruling.
 *
 * @param textPositions The words of a page.
 * @param horizontalRulingLines Horizontal table lines.
 * @param verticalRulingLines Vertical table lines.
 * @return Page object wrapping the list of text blocks that were built.
 */
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
// Words of the block that is currently being built, and all blocks finished so far.
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
// Bounding box of the current block; reset to these start values whenever a block is closed.
// NOTE(review): minX/minY start at 1000, so coordinates above 1000 can never lower them - confirm page sizes stay below that.
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
// State of a horizontal split: whether one is active and at which X the right-hand part started.
boolean wasSplitted = false;
Float splitX1 = null;
for (TextPositionSequence word : textPositions) {
// Gap between the block's lower edge and the word's top exceeds 1.25 times the smaller of both word heights.
boolean lineSeparation = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * 1.25;
// Word lies above the previous word by more than the previous word's text height.
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
// Word starts more than 50 units right of the block's right edge and has exactly the same minY as the previous word.
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
// Word starts more than 5 units left of the block's left edge.
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
// A horizontal split is active, the word has a different minY than the previous one and does not start at the split X.
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
// Exactly one of both words has a font style containing "bold" and their maxY differs.
boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle()
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
// Note: splitByX and newLineAfterSplit do not close a block on their own; they only influence the orientation set below.
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap)) {
// Orientation of the last finished block, read before the new block is appended.
Orientation prevOrientation = null;
if (!chunkBlockList1.isEmpty()) {
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
}
// chunkWords holds at least the previous word here, because prev != null.
TextPageBlock cb1 = buildTextBlock(chunkWords);
chunkBlockList1.add(cb1);
chunkWords = new ArrayList<>();
if (splitByX && !isSplitByRuling) {
wasSplitted = true;
cb1.setOrientation(Orientation.LEFT);
splitX1 = word.getMinXDirAdj();
} else if (newLineAfterSplit && !isSplitByRuling) {
wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT);
splitX1 = null;
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
cb1.setOrientation(Orientation.LEFT);
}
// Reset the bounding box for the next block.
minX = 1000;
maxX = 0;
minY = 1000;
maxY = 0;
// This assignment is overwritten by "prev = word" right after the if-block.
prev = null;
}
chunkWords.add(word);
prev = word;
// Expand the bounding box of the current block by the word.
if (word.getMinXDirAdj() < minX) {
minX = word.getMinXDirAdj();
}
if (word.getMaxXDirAdj() > maxX) {
maxX = word.getMaxXDirAdj();
}
if (word.getMinYDirAdj() < minY) {
minY = word.getMinYDirAdj();
}
if (word.getMaxYDirAdj() > maxY) {
maxY = word.getMaxYDirAdj();
}
}
// Flush the remaining words; buildTextBlock returns null if there are none (e.g. no words on the page).
TextPageBlock cb1 = buildTextBlock(chunkWords);
if (cb1 != null) {
chunkBlockList1.add(cb1);
}
return new ClassificationPage(chunkBlockList1);
}
/**
 * Returns {@code true} if both values differ by less than {@code THRESHOLD}.
 */
private boolean equalsWithThreshold(float f1, float f2) {
    float distance = Math.abs(f1 - f2);
    return distance < THRESHOLD;
}
/**
 * Builds one text block that spans all given words and stores the most frequent font, style, font size,
 * text height and space width of the words (plus the highest font size) on it.
 * If all words share the same minYDirAdj (rounded to 3 decimals), the block's sequences are sorted by minXDirAdj.
 *
 * @param wordBlockList the words of the block
 * @return the block, or {@code null} if {@code wordBlockList} is empty
 */
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {
    if (wordBlockList.isEmpty()) {
        return null;
    }

    FloatFrequencyCounter heights = new FloatFrequencyCounter();
    FloatFrequencyCounter fontSizes = new FloatFrequencyCounter();
    FloatFrequencyCounter spaceWidths = new FloatFrequencyCounter();
    StringFrequencyCounter fonts = new StringFrequencyCounter();
    StringFrequencyCounter styles = new StringFrequencyCounter();

    TextPageBlock block = null;
    for (TextPositionSequence word : wordBlockList) {
        heights.add(word.getTextHeight());
        fontSizes.add(word.getFontSize());
        spaceWidths.add(word.getSpaceWidth());
        fonts.add(word.getFont());
        styles.add(word.getFontStyle());

        if (block != null) {
            // Every further word enlarges the block to the union of block and word.
            TextPageBlock merged = block.union(word);
            block.resize(merged.getMinX(), merged.getMinY(), merged.getWidth(), merged.getHeight());
            continue;
        }
        // The first word defines the initial extent of the block.
        block = new TextPageBlock(word.getMinXDirAdj(), word.getMaxXDirAdj(), word.getMinYDirAdj(), word.getMaxYDirAdj(), wordBlockList, word.getRotation());
    }

    block.setMostPopularWordFont(fonts.getMostPopular());
    block.setMostPopularWordStyle(styles.getMostPopular());
    block.setMostPopularWordFontSize(fontSizes.getMostPopular());
    block.setMostPopularWordHeight(heights.getMostPopular());
    block.setMostPopularWordSpaceWidth(spaceWidths.getMostPopular());
    block.setHighestFontSize(fontSizes.getHighest());

    var sequences = block.getSequences();
    if (sequences != null) {
        int distinctLinePositions = sequences.stream().map(sequence -> round(sequence.getMinYDirAdj(), 3)).collect(toSet()).size();
        if (distinctLinePositions == 1) {
            sequences.sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
        }
    }
    return block;
}
/**
 * Checks whether a ruling lies between the current block and {@code word}. Four connecting segments are
 * tested in this order, stopping at the first hit:
 * (maxX,minY)-(word minX,word minY) against the vertical rulings,
 * (minX,minY)-(word minX,word maxY) against the horizontal rulings,
 * (maxX,minY)-(word minX,word minY) against the horizontal rulings,
 * (minX,minY)-(word minX,word maxY) against the vertical rulings.
 * The {@code maxY} parameter is not used.
 */
private boolean isSplitByRuling(float minX,
                                float minY,
                                float maxX,
                                float maxY,
                                TextPositionSequence word,
                                List<Ruling> horizontalRulingLines,
                                List<Ruling> verticalRulingLines) {
    var wordMinX = word.getMinXDirAdj();
    var wordMinY = word.getMinYDirAdj();
    var wordMaxY = word.getMaxYDirAdj();
    var dir = word.getDir().getDegrees();
    var pageWidth = word.getPageWidth();
    var pageHeight = word.getPageHeight();

    if (isSplitByRuling(maxX, minY, wordMinX, wordMinY, verticalRulingLines, dir, pageWidth, pageHeight)) {
        return true;
    }
    if (isSplitByRuling(minX, minY, wordMinX, wordMaxY, horizontalRulingLines, dir, pageWidth, pageHeight)) {
        return true;
    }
    if (isSplitByRuling(maxX, minY, wordMinX, wordMinY, horizontalRulingLines, dir, pageWidth, pageHeight)) {
        return true;
    }
    return isSplitByRuling(minX, minY, wordMinX, wordMaxY, verticalRulingLines, dir, pageWidth, pageHeight);
}
/**
 * Returns {@code true} if at least one ruling, after conversion with {@code RulingTextDirAdjustUtil.convertToDirAdj},
 * intersects the segment from (previousX2, previousY1) to (currentX1, currentY1).
 */
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
    return rulingLines.stream()
            .map(ruling -> RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight))
            .anyMatch(line -> line.intersectsLine(previousX2, previousY1, currentX1, currentY1));
}
/**
 * Rounds {@code value} to {@code decimalPoints} decimal places.
 */
private double round(float value, int decimalPoints) {
    double scale = Math.pow(10, decimalPoints);
    long scaled = Math.round(value * scale);
    return scaled / scale;
}
}

View File

@ -0,0 +1,278 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import static java.util.stream.Collectors.toSet;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
@SuppressWarnings("all")
@Service
public class RedactManagerBlockificationService {
static final float THRESHOLD = 1f;
/**
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
* This method must use text direction adjusted positions (DirAdj), where {0,0} is on the upper left. Never try to change this!
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
*
* @param textPositions The words of a page.
* @param horizontalRulingLines Horizontal table lines.
* @param verticalRulingLines Vertical table lines.
* @return Page object that contains the Textblock and text statistics.
*/
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
int indexOnPage = 0;
List<TextPositionSequence> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
boolean wasSplitted = false;
Float splitX1 = null;
for (TextPositionSequence word : textPositions) {
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
Orientation prevOrientation = null;
if (!chunkBlockList.isEmpty()) {
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
}
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
indexOnPage++;
chunkBlockList.add(cb1);
chunkWords = new ArrayList<>();
if (splitByX && !isSplitByRuling) {
wasSplitted = true;
cb1.setOrientation(Orientation.LEFT);
splitX1 = word.getMinXDirAdj();
} else if (newLineAfterSplit && !isSplitByRuling) {
wasSplitted = false;
cb1.setOrientation(Orientation.RIGHT);
splitX1 = null;
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
cb1.setOrientation(Orientation.LEFT);
}
minX = 1000;
maxX = 0;
minY = 1000;
maxY = 0;
prev = null;
}
chunkWords.add(word);
prev = word;
if (word.getMinXDirAdj() < minX) {
minX = word.getMinXDirAdj();
}
if (word.getMaxXDirAdj() > maxX) {
maxX = word.getMaxXDirAdj();
}
if (word.getMinYDirAdj() < minY) {
minY = word.getMinYDirAdj();
}
if (word.getMaxYDirAdj() > maxY) {
maxY = word.getMaxYDirAdj();
}
}
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
if (cb1 != null) {
chunkBlockList.add(cb1);
}
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
TextPageBlock previousLeft = null;
TextPageBlock previousRight = null;
while (itty.hasNext()) {
TextPageBlock block = (TextPageBlock) itty.next();
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
previousLeft.add(block);
itty.remove();
continue;
}
}
if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) {
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
previousRight.add(block);
itty.remove();
continue;
}
}
if (block.getOrientation().equals(Orientation.LEFT)) {
previousLeft = block;
} else if (block.getOrientation().equals(Orientation.RIGHT)) {
previousRight = block;
}
}
itty = chunkBlockList.iterator();
TextPageBlock previous = null;
while (itty.hasNext()) {
TextPageBlock block = (TextPageBlock) itty.next();
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
previous.add(block);
itty.remove();
continue;
}
previous = block;
}
return new ClassificationPage(chunkBlockList);
}
private boolean equalsWithThreshold(float f1, float f2) {
return Math.abs(f1 - f2) < THRESHOLD;
}
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
TextPageBlock textBlock = null;
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
for (TextPositionSequence wordBlock : wordBlockList) {
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
fontFrequencyCounter.add(wordBlock.getFont());
styleFrequencyCounter.add(wordBlock.getFontStyle());
if (textBlock == null) {
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
wordBlock.getMaxXDirAdj(),
wordBlock.getMinYDirAdj(),
wordBlock.getMaxYDirAdj(),
wordBlockList,
wordBlock.getRotation());
} else {
TextPageBlock spatialEntity = textBlock.union(wordBlock);
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
}
}
if (textBlock != null) {
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
}
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
}
return textBlock;
}
private boolean isSplitByRuling(float minX,
float minY,
float maxX,
float maxY,
TextPositionSequence word,
List<Ruling> horizontalRulingLines,
List<Ruling> verticalRulingLines) {
return isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(maxX,
minY,
word.getMinXDirAdj(),
word.getMinYDirAdj(),
horizontalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight()) //
|| isSplitByRuling(minX,
minY,
word.getMinXDirAdj(),
word.getMaxYDirAdj(),
verticalRulingLines,
word.getDir().getDegrees(),
word.getPageWidth(),
word.getPageHeight());
}
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
for (Ruling ruling : rulingLines) {
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
return true;
}
}
return false;
}
private double round(float value, int decimalPoints) {
var d = Math.pow(10, decimalPoints);
return Math.round(value * d) / d;
}
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import java.util.ArrayList;
import java.util.Iterator;
@ -9,17 +9,17 @@ import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.RulingTextDirAdjustUtil;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
@Service
@SuppressWarnings("all")
public class BlockificationService {
public class TaasBlockificationService {
private static final float THRESHOLD = 1f;
private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f;
@ -137,7 +137,7 @@ public class BlockificationService {
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
TextPositionSequence prev = null;
// TODO: make static final constant
var listIdentitifier = Pattern.compile("\\b(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)])", Pattern.CASE_INSENSITIVE);
boolean wasSplitted = false;
@ -146,7 +146,7 @@ public class BlockificationService {
Matcher listIdentifierPattern = listIdentitifier.matcher(word.toString());
boolean yGap = word.getMinYDirAdj() - maxY > word.getHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
boolean yGap = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj());
boolean positiveXGapInline = prev != null && maxX + X_GAP_SPLIT_CONSTANT < word.getMinXDirAdj() && sameLine;
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;

View File

@ -0,0 +1,117 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class DocuMineClassificationService {

    /** Text starting with a dotted/multi-digit numbering followed by whitespace and 2-50 word characters; used for second-level headlines. */
    private static final Pattern NUMBERED_HEADLINE_PATTERN = Pattern.compile("^(\\d{1,1}\\.?){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z\\[\\]\\-]{2,50}", Pattern.CASE_INSENSITIVE);

    /** Text ending in four digits; such blocks are never classified as headlines. NOTE(review): presumably filters lines ending in a year - confirm. */
    private static final Pattern ENDS_WITH_FOUR_DIGITS_PATTERN = Pattern.compile(".*\\d{4}$", Pattern.CASE_INSENSITIVE);

    /** Dotted numbering followed by two short lower-case letter groups separated by a slash; excluded from second-level headlines. */
    private static final Pattern NUMBERING_WITH_SLASHED_LETTERS_PATTERN = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");

    /** Text consisting of digits only; compiled once instead of once per classified block. */
    private static final Pattern DIGITS_ONLY_PATTERN = Pattern.compile("[0-9]+");

    private final BodyTextFrameService bodyTextFrameService;


    /**
     * Classifies every text block of the document. The portrait and landscape body text frames are calculated once
     * for the whole document, applied to each page, and then every {@link TextPageBlock} of the page is classified.
     *
     * @param document the document whose text blocks receive a {@link PageBlockType}
     */
    public void classifyDocument(ClassificationDocument document) {

        Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
        Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
        List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();

        log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());

        for (ClassificationPage page : document.getPages()) {
            bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
            classifyPage(page, document, headlineFontSizes);
        }
    }


    /** Classifies all blocks of the page that are {@link TextPageBlock}s; other block types are left untouched. */
    private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {

        for (AbstractPageBlock textBlock : page.getTextBlocks()) {
            if (textBlock instanceof TextPageBlock) {
                classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
            }
        }
    }


    /**
     * Assigns a {@link PageBlockType} to a single text block. The rules are evaluated in order and the first match wins:
     * first-level headline, numbered second-level headline, header, footer, title (first page only), bold paragraph,
     * regular paragraph, italic paragraph, unknown paragraph inside the body text frame, and finally OTHER.
     * A block that matches the title rule but consists of digits only keeps its current classification.
     *
     * @param headlineFontSizes font sizes larger than the most popular one; only logged by this implementation
     */
    private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {

        log.debug("headlineFontSizes: {}", headlineFontSizes);

        var bodyTextFrame = page.getBodyTextFrame();

        // The string form of the block is needed by almost every rule, so it is built only once.
        String text = textBlock.toString();
        Matcher numberedHeadline = NUMBERED_HEADLINE_PATTERN.matcher(text);
        Matcher endsWithFourDigits = ENDS_WITH_FOUR_DIGITS_PATTERN.matcher(text);
        Matcher numberingWithSlashedLetters = NUMBERING_WITH_SLASHED_LETTERS_PATTERN.matcher(text);

        // Without a most popular font size none of the font based rules can be evaluated.
        var mostPopularFontSize = document.getFontSizeCounter().getMostPopular();
        if (mostPopularFontSize == null) {
            textBlock.setClassification(PageBlockType.OTHER);
            return;
        }

        // NOTE(review): the `==` font size comparisons below are numeric only if
        // getMostPopularWordFontSize() returns a primitive float - confirm against TextPageBlock.
        if (textBlock.getText().length() > 6 //
                && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular() || textBlock.getMostPopularWordFontSize() > mostPopularFontSize) //
                && PositionUtils.getApproxLineCount(textBlock) < 5.9 //
                && (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(text.charAt(0)) && !endsWithFourDigits.matches() && !text.contains(":") //
                || text.equals(text.toUpperCase(Locale.ROOT)) && !endsWithFourDigits.matches() && !text.contains(":") //
                || text.startsWith("APPENDIX") //
                || text.startsWith("FIGURE") //
                || text.startsWith("TABLE")) //
                && !text.endsWith(":")) {
            // Larger-than-usual, short text that is bold and numbered, all upper case, or starts with a known keyword.
            textBlock.setClassification(PageBlockType.getHeadlineType(1));
            document.setHeadlines(true);
        } else if (numberedHeadline.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && !numberingWithSlashedLetters.matches() && !endsWithFourDigits.matches()) {
            textBlock.setClassification(PageBlockType.getHeadlineType(2));
            document.setHeadlines(true);
        } else if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && textBlock.getHighestFontSize() <= mostPopularFontSize) {
            textBlock.setClassification(PageBlockType.HEADER);
        } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && textBlock.getHighestFontSize() <= mostPopularFontSize) {
            textBlock.setClassification(PageBlockType.FOOTER);
        } else if (page.getPageNumber() == 1 //
                && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
                && textBlock.getHighestFontSize() > mostPopularFontSize || page.getTextBlocks().size() == 1)) {
            // Digit-only blocks on the first page are deliberately not turned into a title.
            if (!DIGITS_ONLY_PATTERN.matcher(text).matches()) {
                textBlock.setClassification(PageBlockType.TITLE);
            }
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) //
                && textBlock.getMostPopularWordFontSize() == mostPopularFontSize //
                && textBlock.getMostPopularWordStyle().equals("bold") //
                && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
            textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) //
                && textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular()) //
                && textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular()) //
                && textBlock.getMostPopularWordFontSize() == mostPopularFontSize) {
            textBlock.setClassification(PageBlockType.PARAGRAPH);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) //
                && textBlock.getMostPopularWordFontSize() == mostPopularFontSize //
                && textBlock.getMostPopularWordStyle().equals("italic") //
                && !document.getFontStyleCounter().getMostPopular().equals("italic") //
                && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
            textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
            textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
        } else {
            textBlock.setClassification(PageBlockType.OTHER);
        }
    }

}

View File

@ -0,0 +1,116 @@
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import java.util.List;
import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class RedactManagerClassificationService {

    /** Text consisting of digits only; compiled once instead of once per classified block. */
    private static final Pattern DIGITS_ONLY_PATTERN = Pattern.compile("[0-9]+");

    private final BodyTextFrameService bodyTextFrameService;


    /**
     * Classifies every text block of the document. The portrait and landscape body text frames are calculated once
     * for the whole document, applied to each page, and then every {@link TextPageBlock} of the page is classified.
     *
     * @param document the document whose text blocks receive a {@link PageBlockType}
     */
    public void classifyDocument(ClassificationDocument document) {

        Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
        Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
        List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();

        log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());

        for (ClassificationPage page : document.getPages()) {
            bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
            classifyPage(page, document, headlineFontSizes);
        }
    }


    /** Classifies all blocks of the page that are {@link TextPageBlock}s; other block types are left untouched. */
    private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {

        for (AbstractPageBlock textBlock : page.getTextBlocks()) {
            if (textBlock instanceof TextPageBlock) {
                classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
            }
        }
    }


    /**
     * Assigns a {@link PageBlockType} to a single text block. The rules are evaluated in order and the first match wins:
     * header, footer, title (first page only), headline by font size, bold headline, bold paragraph, regular paragraph,
     * italic paragraph, unknown paragraph inside the body text frame, and finally OTHER.
     * A block that matches the title rule but consists of digits only, or that matches the font size headline rule
     * without its font size being contained in {@code headlineFontSizes}, keeps its current classification.
     *
     * @param headlineFontSizes font sizes larger than the most popular one; the position of the block's font size
     *                          in this list determines the headline level
     */
    private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {

        var bodyTextFrame = page.getBodyTextFrame();

        // Without a most popular font size none of the font based rules can be evaluated.
        var mostPopularFontSize = document.getFontSizeCounter().getMostPopular();
        if (mostPopularFontSize == null) {
            textBlock.setClassification(PageBlockType.OTHER);
            return;
        }

        // NOTE(review): the `==` font size comparisons below are numeric only if
        // getMostPopularWordFontSize() returns a primitive float - confirm against TextPageBlock.
        if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && textBlock.getHighestFontSize() <= mostPopularFontSize) {
            textBlock.setClassification(PageBlockType.HEADER);
        } else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && textBlock.getHighestFontSize() <= mostPopularFontSize) {
            textBlock.setClassification(PageBlockType.FOOTER);
        } else if (page.getPageNumber() == 1 //
                && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
                && textBlock.getHighestFontSize() > mostPopularFontSize || page.getTextBlocks().size() == 1)) {
            // Digit-only blocks on the first page are deliberately not turned into a title.
            if (!DIGITS_ONLY_PATTERN.matcher(textBlock.toString()).matches()) {
                textBlock.setClassification(PageBlockType.TITLE);
            }
        } else if (textBlock.getMostPopularWordFontSize() > mostPopularFontSize //
                && PositionUtils.getApproxLineCount(textBlock) < 4.9 //
                && (textBlock.getMostPopularWordStyle().equals("bold") //
                || !document.getFontStyleCounter().getCountPerValue().containsKey("bold") && textBlock.getMostPopularWordFontSize() > mostPopularFontSize + 1) //
                && firstGlyphIsAtLeastMostPopularSize(textBlock)) {
            // The headline level is the 1-based position of the block's font size within headlineFontSizes.
            for (int i = 1; i <= headlineFontSizes.size(); i++) {
                if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
                    textBlock.setClassification(PageBlockType.getHeadlineType(i));
                    document.setHeadlines(true);
                }
            }
        } else if (!textBlock.getText().startsWith("Figure ") //
                && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) //
                && textBlock.getMostPopularWordStyle().equals("bold") //
                && !document.getFontStyleCounter().getMostPopular().equals("bold") //
                && PositionUtils.getApproxLineCount(textBlock) < 2.9 //
                && firstGlyphIsAtLeastMostPopularSize(textBlock)) {
            // Short bold text in body size: one level below all font size based headlines.
            textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
            document.setHeadlines(true);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) //
                && textBlock.getMostPopularWordFontSize() == mostPopularFontSize //
                && textBlock.getMostPopularWordStyle().equals("bold") //
                && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
            textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) //
                && textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular()) //
                && textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular()) //
                && textBlock.getMostPopularWordFontSize() == mostPopularFontSize) {
            textBlock.setClassification(PageBlockType.PARAGRAPH);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) //
                && textBlock.getMostPopularWordFontSize() == mostPopularFontSize //
                && textBlock.getMostPopularWordStyle().equals("italic") //
                && !document.getFontStyleCounter().getMostPopular().equals("italic") //
                && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
            textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
        } else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
            textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
        } else {
            textBlock.setClassification(PageBlockType.OTHER);
        }
    }


    /**
     * Returns true when the very first text position of the block's first sequence has a font size (in pt) that is
     * at least the block's most popular word font size. Shared by both headline rules.
     */
    private boolean firstGlyphIsAtLeastMostPopularSize(TextPageBlock textBlock) {

        return textBlock.getSequences().get(0).getTextPositions().get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize();
    }

}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
import java.util.List;
import java.util.regex.Pattern;
@ -6,12 +6,13 @@ import java.util.regex.Pattern;
import org.springframework.stereotype.Service;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -19,7 +20,7 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
public class ClassificationService {
public class TaasClassificationService {
private final BodyTextFrameService bodyTextFrameService;

View File

@ -9,7 +9,7 @@
* This program is free software under the LGPL (>=v2.1)
* Read the file LICENSE.txt coming with the sources for details.
*/
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Line2D;
import java.awt.geom.Rectangle2D;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.math.BigDecimal;
import java.util.Comparator;

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.Color;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.IOException;
@ -150,6 +151,32 @@ public class PdfVisualisationUtility {
}
/**
 * Draws the given lines onto a page of the document by appending to the page's existing content.
 * Stroke color, fill color and line width are taken from {@code options}; each line is painted according to
 * the stroke/fill flags of {@code options}.
 *
 * @param pdDocument the document to draw into
 * @param pageNumber 1-based number of the page to draw on
 * @param line2DS    the lines to draw
 * @param options    colors, line width and paint mode
 */
@SneakyThrows
public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List<Line2D> line2DS, Options options) {

    var pdPage = pdDocument.getPage(pageNumber - 1);

    // try-with-resources guarantees the content stream is closed even if one of the drawing calls throws.
    try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
        contentStream.setStrokingColor(options.getStrokeColor());
        contentStream.setNonStrokingColor(options.getFillColor());
        contentStream.setLineWidth(options.getStrokeWidth());

        for (var line2D : line2DS) {
            contentStream.moveTo((float) line2D.getX1(), (float) line2D.getY1());
            contentStream.lineTo((float) line2D.getX2(), (float) line2D.getY2());

            if (options.isStroke() && options.isFill()) {
                contentStream.fillAndStroke();
            } else if (options.isStroke()) {
                contentStream.stroke();
            } else if (options.isFill()) {
                contentStream.fill();
            }
        }
    }
}
@Builder
@Getter
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)

View File

@ -1,7 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
package com.knecon.fforesight.service.layoutparser.processor.utils;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.experimental.UtilityClass;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.ArrayDeque;
import java.util.Comparator;

View File

@ -1,5 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import static java.lang.String.format;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.Collections;
@ -23,6 +25,27 @@ import lombok.NoArgsConstructor;
public class RectangleTransformations {
/**
 * Returns a new rectangle that is grown by {@code deltaX} on the left and right and by {@code deltaY} on the top
 * and bottom. Delegates to {@link #pad(Rectangle2D, double, double)} so that the padding is computed in double
 * arithmetic and cannot overflow for large deltas.
 *
 * @param rectangle2D the rectangle to pad; it is not modified
 * @param deltaX      horizontal padding per side
 * @param deltaY      vertical padding per side
 * @return the padded rectangle
 */
public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) {

    return pad(rectangle2D, (double) deltaX, (double) deltaY);
}


/**
 * Returns a new rectangle that is grown by {@code deltaX} on the left and right and by {@code deltaY} on the top
 * and bottom. Negative deltas shrink the rectangle.
 *
 * @param rectangle2D the rectangle to pad; it is not modified
 * @param deltaX      horizontal padding per side
 * @param deltaY      vertical padding per side
 * @return the padded rectangle
 */
public static Rectangle2D pad(Rectangle2D rectangle2D, double deltaX, double deltaY) {

    return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY);
}
/**
 * Collects the positions of all given atomic text blocks into a single rectangle using
 * {@link Rectangle2DBBoxCollector}.
 *
 * @param atomicTextBlocks the blocks whose positions are combined
 * @return the rectangle produced by the collector for all positions
 */
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {

    var allPositions = atomicTextBlocks.stream().flatMap(block -> block.getPositions().stream());
    return allPositions.collect(new Rectangle2DBBoxCollector());
}
/**
 * Factory for a stream collector that reduces {@link Rectangle2D}s to a single rectangle.
 *
 * @return a new {@link Rectangle2DBBoxCollector}
 */
public static Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> collectBBox() {

    Rectangle2DBBoxCollector bBoxCollector = new Rectangle2DBBoxCollector();
    return bBoxCollector;
}
public static PDRectangle toPDRectangleBBox(List<Rectangle> rectangles) {
Rectangle2D rectangle2D = RectangleTransformations.rectangleBBox(rectangles);
@ -42,6 +65,11 @@ public class RectangleTransformations {
}
/**
 * Formats a rectangle as {@code x,y,width,height} with six decimal places per value.
 * The values are always formatted with {@link java.util.Locale#ROOT}; with the default locale a comma decimal
 * separator (e.g. German) would be indistinguishable from the comma that separates the four values.
 *
 * @param rectangle2D the rectangle to format
 * @return the comma separated representation
 */
public static String toString(Rectangle2D rectangle2D) {

    return format(java.util.Locale.ROOT, "%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
}
public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector());
@ -56,6 +84,11 @@ public class RectangleTransformations {
-redactionLogRectangle.getHeight());
}
/**
 * Converts a PDFBox rectangle into a {@link Rectangle2D}, anchored at the rectangle's lower left corner and
 * keeping its width and height.
 *
 * @param rectangle the PDFBox rectangle to convert
 * @return a new {@link Rectangle2D.Double} with the same lower left corner, width and height
 */
public static Rectangle2D toRectangle2D(PDRectangle rectangle) {

    double x = rectangle.getLowerLeftX();
    double y = rectangle.getLowerLeftY();
    double width = rectangle.getWidth();
    double height = rectangle.getHeight();
    return new Rectangle2D.Double(x, y, width, height);
}
public static Rectangle toRedactionLogRectangle(Rectangle2D rectangle2D, int pageNumber) {

View File

@ -1,9 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import lombok.experimental.UtilityClass;

View File

@ -5,15 +5,18 @@ import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import lombok.experimental.UtilityClass;
@UtilityClass
public class TableMergingUtility {
private static final double TABLE_ALIGNMENT_THRESHOLD = 2d;
public List<TablePageBlock> findConsecutiveTablesWithSameColCountAndSameHeaders(TablePageBlock originalTablePageBlock, List<AbstractPageBlock> pageBlocks) {
List<TablePageBlock> consecutiveTables = pageBlocks.stream()
@ -24,7 +27,8 @@ public class TableMergingUtility {
List<TablePageBlock> consecutiveTablesWithSameColCountAndHeaders = new LinkedList<>();
for (TablePageBlock consecutiveTable : consecutiveTables) {
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable)) {
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock,
consecutiveTable)) {
consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable);
} else {
break;
@ -34,6 +38,12 @@ public class TableMergingUtility {
}
/**
 * Checks whether two tables share the same horizontal extent: both their minX and their maxX values must differ
 * by less than {@code TABLE_ALIGNMENT_THRESHOLD}.
 *
 * @param originalTablePageBlock the table the candidate is compared against
 * @param consecutiveTable       the candidate table
 * @return true if the left and the right outer boundary of both tables align
 */
private static boolean outerBoundaryAlignsX(TablePageBlock originalTablePageBlock, TablePageBlock consecutiveTable) {

    // Written as a negated "<" so that the left edge is checked first and the comparison semantics stay unchanged.
    if (!(Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD)) {
        return false;
    }
    return Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX()) < TABLE_ALIGNMENT_THRESHOLD;
}
private boolean hasTableHeader(TablePageBlock table) {
return table.getRows().stream().flatMap(Collection::stream).anyMatch(Cell::isHeaderCell);

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
package com.knecon.fforesight.service.layoutparser.processor.utils;
import lombok.experimental.UtilityClass;

View File

@ -3,8 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.Comparator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
public class TextPositionOperations {

View File

@ -0,0 +1,72 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.Comparator;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
/**
 * Comparator for {@link TextPositionSequence} objects. Sequences are first ordered by their text
 * direction, so that pages containing text in several directions keep each direction grouped
 * together. Within one direction, sequences that overlap vertically are ordered by their
 * direction-adjusted X position, all others by their direction-adjusted bottom Y position.
 *
 * @author Ben Litchfield
 */
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> {

    @Override
    public int compare(TextPositionSequence pos1, TextPositionSequence pos2) {
        // sequences running in different directions are ordered by direction only
        int directionOrder = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
        if (directionOrder != 0) {
            return directionOrder;
        }

        // direction-adjusted coordinates; they are adjusted so that 0,0 is in the upper left
        float left1 = pos1.getMinXDirAdj();
        float left2 = pos2.getMinXDirAdj();
        float bottom1 = pos1.getMaxYDirAdj();
        float bottom2 = pos2.getMaxYDirAdj();
        float top1 = bottom1 - pos1.getTextHeight();
        float top2 = bottom2 - pos2.getTextHeight();

        // simple tolerance comparison: nearly equal bottoms, or one bottom inside the other's vertical span
        boolean bottomsNearlyEqual = Math.abs(bottom1 - bottom2) < .1;
        boolean secondBottomInsideFirst = bottom2 >= top1 && bottom2 <= bottom1;
        boolean firstBottomInsideSecond = bottom1 >= top2 && bottom1 <= bottom2;

        if (bottomsNearlyEqual || secondBottomInsideFirst || firstBottomInsideSecond) {
            return Float.compare(left1, left2);
        }
        return bottom1 < bottom2 ? -1 : 1;
    }
}

View File

@ -0,0 +1,16 @@
<Configuration>
<!-- Logging configuration: one console appender; the root logger is at "warn" and the "com.iqser" logger at "info". -->
<Appenders>
<!-- Writes to SYSTEM_OUT using the pattern: time, thread, level, logger name (precision 36), message. -->
<Console name="CONSOLE" target="SYSTEM_OUT">
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
</Console>
</Appenders>
<Loggers>
<!-- Level applied to every logger that has no more specific entry below. -->
<Root level="warn">
<AppenderRef ref="CONSOLE"/>
</Root>
<!-- NOTE(review): the Java sources in this change live in com.knecon.fforesight packages; confirm that "com.iqser" is the intended logger name. -->
<Logger name="com.iqser" level="info"/>
</Loggers>
</Configuration>

View File

@ -20,7 +20,8 @@ import org.springframework.beans.factory.annotation.Autowired;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
@ -37,7 +38,7 @@ public class BdrJsonBuildTest extends BaseTest {
private ObjectMapper objectMapper;
@Autowired
private LayoutParsingService layoutParsingService;
private LayoutParsingPipeline layoutParsingPipeline;
@SneakyThrows
@ -45,7 +46,7 @@ public class BdrJsonBuildTest extends BaseTest {
try (InputStream inputStream = new FileInputStream(filename)) {
PDDocument pdDocument = Loader.loadPDF(inputStream);
return layoutParsingService.parseLayoutWithTimer(pdDocument, new ImageServiceResponse(), new TableServiceResponse());
return layoutParsingPipeline.parseLayoutWithTimer(LayoutParsingType.REDACT_MANAGER, pdDocument, new ImageServiceResponse(), new TableServiceResponse());
}
}

View File

@ -11,7 +11,8 @@ import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.server.utils.BaseTest;
@ -21,7 +22,7 @@ import lombok.SneakyThrows;
public class BuildDocumentGraphTest extends BaseTest {
@Autowired
private LayoutParsingService layoutParsingService;
private LayoutParsingPipeline layoutParsingPipeline;
@Test
@Disabled
@ -50,7 +51,7 @@ public class BuildDocumentGraphTest extends BaseTest {
try (InputStream inputStream = fileResource.getInputStream()) {
PDDocument pdDocument = Loader.loadPDF(inputStream);
return layoutParsingService.parseLayout(pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse());
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse());
}
}

View File

@ -25,7 +25,8 @@ public class DocumentGraphMappingTest extends BuildDocumentGraphTest {
DocumentData documentData = DocumentDataMapper.toDocumentData(document);
var researchDocumentData = TaasDocumentDataMapper.fromDocument(document);
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, researchDocumentData, documentData);
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, documentData);
DocumentData documentData2 = layoutParsingStorageService.readDocumentData(layoutParsingRequest);
Document newDocumentGraph = DocumentGraphMapper.toDocumentGraph(documentData2);

View File

@ -0,0 +1,58 @@
package com.knecon.fforesight.service.layoutparser.server.model;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
/**
 * Exercises the parent/child relations between {@link SectionIdentifier} instances created from
 * headline texts with different numbering schemes.
 */
class SectionIdentifierTest {

    @Test
    public void testParentOf() {

        // headlines with numeric identifiers of increasing depth
        var sectionOne = SectionIdentifier.fromSearchText("1 Did you ever hear the tragedy of Darth Plagueis The Wise?");
        var sectionOneZero = SectionIdentifier.fromSearchText("1.0 I thought not. Its not a story the Jedi would tell you.");
        var sectionOneOne = SectionIdentifier.fromSearchText("1.1 Its a Sith legend. Darth Plagueis was a Dark Lord of the Sith, ");
        var sectionOneTwoThree = SectionIdentifier.fromSearchText("1.2.3 so powerful and so wise he could use the Force to influence the midichlorians to create life…");
        var sectionOneTwoThreeFour = SectionIdentifier.fromSearchText("1.2.3.4 He had such a knowledge of the dark side that he could even keep the ones he cared about from dying.");
        var sectionOneTwoThreeFourFive = SectionIdentifier.fromSearchText("1.2.3.4.5 The dark side of the Force is a pathway to many abilities some consider to be unnatural.");
        // headlines belonging to other top-level sections, or without a numeric identifier
        var sectionTwoZero = SectionIdentifier.fromSearchText("2.0 He became so powerful…");
        var sectionTenThousand = SectionIdentifier.fromSearchText("10000.0 the only thing he was afraid of was losing his power,");
        var sectionWithLetter = SectionIdentifier.fromSearchText("A.0 which eventually, of course, he did.");
        var unnumberedHeadline = SectionIdentifier.fromSearchText("Unfortunately, he taught his apprentice everything he knew, then his apprentice killed him in his sleep.");
        // unusual spacing around the identifier
        var sectionTwoOneTwo = SectionIdentifier.fromSearchText("2.1.2 Ironic.");
        var sectionTwoNoSpace = SectionIdentifier.fromSearchText("2.He could save others from death,");
        var sectionTwoLeadingSpace = SectionIdentifier.fromSearchText(" 2. but not himself.");

        // an identifier derived from a headline is its child, never its parent
        var childParagraph = SectionIdentifier.asChildOf(sectionOne);
        assertTrue(childParagraph.isChildOf(sectionOne));
        assertTrue(sectionOne.isParentOf(childParagraph));
        assertFalse(childParagraph.isParentOf(sectionOne));

        // relations starting at "1"
        assertFalse(sectionOne.isParentOf(sectionOneZero));
        assertTrue(sectionOne.isParentOf(sectionOneOne));
        assertTrue(sectionOne.isParentOf(sectionOneTwoThree));
        assertTrue(sectionOne.isParentOf(sectionOneTwoThreeFour));
        assertTrue(sectionOne.isParentOf(sectionOneTwoThreeFourFive));

        // relations between deeper levels
        assertTrue(sectionOneZero.isParentOf(sectionOneOne));
        assertFalse(sectionOneZero.isParentOf(sectionOneZero));
        assertTrue(sectionOneTwoThree.isParentOf(sectionOneTwoThreeFour));
        assertFalse(sectionOneTwoThreeFour.isParentOf(sectionOneTwoThreeFourFive));
        assertFalse(sectionOneOne.isParentOf(sectionOneTwoThree));
        assertFalse(sectionOneOne.isParentOf(sectionOneTwoThreeFour));
        assertTrue(sectionOneZero.isParentOf(sectionOneTwoThree));
        assertTrue(sectionOneZero.isParentOf(sectionOneTwoThreeFour));

        // other top-level sections
        assertFalse(sectionOneZero.isParentOf(sectionTwoZero));
        assertFalse(sectionOneZero.isParentOf(sectionTenThousand));

        // letter-prefixed and unnumbered headlines
        assertFalse(sectionWithLetter.isParentOf(sectionOneZero));
        assertFalse(sectionWithLetter.isParentOf(sectionOneOne));
        assertFalse(sectionWithLetter.isParentOf(sectionOneTwoThree));
        assertFalse(sectionWithLetter.isParentOf(sectionOneTwoThreeFour));
        assertFalse(unnumberedHeadline.isParentOf(unnumberedHeadline));

        // "2." is recognised without a following space and with a leading space
        assertTrue(sectionTwoOneTwo.isChildOf(sectionTwoNoSpace));
        assertTrue(sectionTwoOneTwo.isChildOf(sectionTwoLeadingSpace));
    }

}

View File

@ -0,0 +1,71 @@
package com.knecon.fforesight.service.layoutparser.server.services;
import java.awt.geom.Rectangle2D;
import java.util.LinkedList;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.services.DividingColumnDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService;
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows;
/**
 * Visual checks for the two column detection approaches. Each test extracts the text positions of
 * a sample PDF, detects columns per page, draws them into a copy of the PDF and prints the time
 * each step took. The resulting files are meant for manual inspection; nothing is asserted.
 */
class ColumnDetectionServiceTest {

    @Test
    @SneakyThrows
    public void testGapBasedColumnDetection() {

        String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
        // own suffix, so this result is not overwritten by testColumnDetection()
        var tmpFileName = outputFileFor(filename, "_GAP_BASED_COLUMNS.pdf");
        System.out.println("start TextPosition extraction");
        long start = System.currentTimeMillis();
        List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
        List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
        System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
        System.out.println("start column detection");
        start = System.currentTimeMillis();
        for (PageInformation pageInformation : pageInformations) {
            GapInformation gapInformation = GapDetectionService.findGapsInLines(pageInformation.getPageContents().getSortedTextPositionSequences(),
                    pageInformation.getMainBodyTextFrame());
            columnsPerPage.add(GapsAcrossLinesService.detectXGapsAcrossLines(gapInformation, pageInformation.getMainBodyTextFrame()));
        }
        System.out.printf("Finished column detection in %d ms%n", System.currentTimeMillis() - start);
        System.out.println("start draw rectangles");
        start = System.currentTimeMillis();
        PdfDraw.drawRectanglesPerPage(filename, columnsPerPage, tmpFileName);
        System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
    }


    @Test
    @SneakyThrows
    public void testColumnDetection() {

        String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
        var tmpFileName = outputFileFor(filename, "_COLUMNS.pdf");
        System.out.println("start TextPosition extraction");
        long start = System.currentTimeMillis();
        List<PageContents> sortedTextPositionSequencesPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename);
        List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
        System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
        System.out.println("start column detection");
        start = System.currentTimeMillis();
        for (PageContents pageContents : sortedTextPositionSequencesPerPage) {
            columnsPerPage.add(DividingColumnDetectionService.detectColumns(pageContents));
        }
        System.out.printf("Finished column detection in %d ms%n", System.currentTimeMillis() - start);
        System.out.println("start draw rectangles");
        start = System.currentTimeMillis();
        PdfDraw.drawRectanglesPerPage(filename, columnsPerPage, tmpFileName);
        System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
    }


    /**
     * Builds the output path in /tmp/ from the last path segment of {@code filename} plus
     * {@code suffix}; works for resources at any directory depth.
     */
    private static String outputFileFor(String filename, String suffix) {

        String baseName = filename.substring(filename.lastIndexOf('/') + 1);
        return "/tmp/" + baseName + suffix;
    }

}

View File

@ -0,0 +1,23 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import lombok.SneakyThrows;
/**
 * Test stub for invisible table detection. At the moment it only extracts the sorted text
 * positions of a sample PDF; the result is not inspected and nothing is asserted.
 */
class InvisibleTableDetectionServiceTest {

    @Test
    @SneakyThrows
    public void detectInvisibleTableTest() {

        final String samplePdf = "files/test-two-pages_ocred-2.pdf";
        // extraction result is currently unused
        List<PageContents> sortedPages = TextPositionSequenceSorter.getSortedTextPositionsWithPages(samplePdf);
    }

}

View File

@ -0,0 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import static org.junit.jupiter.api.Assertions.*;
/**
 * Empty placeholder: this class does not define any test cases yet.
 */
class MainBodyTextFrameExtractionServiceTest {
}

View File

@ -0,0 +1,50 @@
package com.knecon.fforesight.service.layoutparser.server.services;
import java.awt.geom.Rectangle2D;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.MainBodyTextFrameExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows;
class GapDetectionServiceTest {
@Test
@Disabled
@SneakyThrows
public void testGapDetection() {
String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis();
List<PageContents> sortedTextPositionSequencesPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename);
List<GapInformation> gapInformationInLinesPerPage = new LinkedList<>();
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
System.out.println("start gap detection");
start = System.currentTimeMillis();
for (PageContents pageContents : sortedTextPositionSequencesPerPage) {
// List<List<TextPositionSequence>> lines = LineDetectionService.orderByLine(textPositionsWithPage.getSortedTextPositionSequences());
Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(pageContents);
gapInformationInLinesPerPage.add(GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), mainBodyTextFrame));
}
System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start);
System.out.println("start draw rectangles");
start = System.currentTimeMillis();
PdfDraw.drawRectanglesAndLinesPerPage(filename,
gapInformationInLinesPerPage.stream().map(gaps -> gaps.getYGaps().stream().flatMap(Collection::stream).toList()).toList(),
gapInformationInLinesPerPage.stream().map(gaps -> gaps.getXGaps().stream().flatMap(Collection::stream).toList()).toList(), tmpFileName);
System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
}
}

View File

@ -0,0 +1,39 @@
package com.knecon.fforesight.service.layoutparser.server.services;
import java.nio.file.Path;
import java.util.List;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows;
/**
 * Visual check for the text position extraction: draws the rectangle of every extracted text
 * position sequence into a copy of the sample PDF, numbered in extraction order. Nothing is
 * asserted.
 */
class TextPositionSequenceSorterTest {

    @Test
    @SneakyThrows
    public void testTextPositionSequenceExtraction() {

        String fileName = "files/new/test-two-pages_ocred-2.pdf";
        var outputName = Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf";
        var tmpFileName = Path.of("/tmp/").resolve(outputName).toString();

        List<PageContents> textPositionPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);

        // every sequence becomes its own single-element "line", so each one gets its own number
        var numberedRectanglesPerPage = textPositionPerPage.stream()
                .map(page -> page.getSortedTextPositionSequences()
                        .stream()
                        .map(TextPositionSequence::getRectangle)
                        .map(RectangleTransformations::toRectangle2D)
                        .map(List::of)
                        .toList())
                .toList();

        PdfDraw.drawRectanglesPerPageNumberedByLine(fileName, numberedRectanglesPerPage, tmpFileName);
    }

}

View File

@ -3,16 +3,20 @@ package com.knecon.fforesight.service.layoutparser.server.utils.visualizations;
import java.awt.Color;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
import org.apache.pdfbox.util.Matrix;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
@ -20,7 +24,8 @@ import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -34,6 +39,68 @@ import lombok.experimental.UtilityClass;
@UtilityClass
public class PdfDraw {
/**
 * Loads a PDF from the classpath, draws the given rectangles onto its pages and saves the result.
 *
 * @param filename          classpath location of the PDF to draw on
 * @param rectanglesPerPage rectangles to draw, one list per page in page order; must contain an
 *                          entry for every page of the document
 * @param tmpFileName       path of the file the modified PDF is written to
 * @throws IOException if the PDF cannot be loaded, drawn on or saved
 */
public static void drawRectanglesPerPage(String filename, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) throws IOException {

    // the document is a managed resource, so it is closed even when drawing or saving fails
    try (InputStream inputStream = new ClassPathResource(filename).getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) {
        int numberOfPages = pdDocument.getNumberOfPages();
        // page numbers are 1-based, the list of rectangles is 0-based
        for (int pageNumber = 1; pageNumber <= numberOfPages; pageNumber++) {
            PdfVisualisationUtility.drawRectangle2DList(pdDocument,
                    pageNumber,
                    rectanglesPerPage.get(pageNumber - 1),
                    PdfVisualisationUtility.Options.builder().stroke(true).build());
        }
        try (var out = new FileOutputStream(tmpFileName)) {
            pdDocument.save(out);
        }
    }
}
/**
 * Loads a PDF from the classpath, draws the given rectangles line by line and writes the index of
 * each line to the left of the line's first rectangle, then saves the result.
 *
 * @param filename          classpath location of the PDF to draw on
 * @param rectanglesPerPage rectangles to draw, grouped by page and then by line; must contain an
 *                          entry for every page of the document. Lines without rectangles are
 *                          skipped, but still count towards the line index.
 * @param tmpFileName       path of the file the modified PDF is written to
 * @throws IOException if the PDF cannot be loaded, drawn on or saved
 */
public static void drawRectanglesPerPageNumberedByLine(String filename, List<List<List<Rectangle2D>>> rectanglesPerPage, String tmpFileName) throws IOException {

    // the document is a managed resource, so it is closed even when drawing or saving fails
    try (InputStream inputStream = new ClassPathResource(filename).getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) {
        int numberOfPages = pdDocument.getNumberOfPages();
        // page numbers are 1-based, the list of rectangles is 0-based
        for (int pageNumber = 1; pageNumber <= numberOfPages; pageNumber++) {
            var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1);
            for (int lineNumber = 0; lineNumber < rectanglesOnPage.size(); lineNumber++) {
                var rectanglesInLine = rectanglesOnPage.get(lineNumber);
                if (rectanglesInLine.isEmpty()) {
                    // nothing to draw and no rectangle to anchor the label to
                    continue;
                }
                PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build());

                Rectangle2D firstInLine = rectanglesInLine.get(0);
                double y = Math.min(firstInLine.getMinY(), firstInLine.getMaxY());
                // shift the label further left the more digits the line number has
                double x = firstInLine.getX() - (5 + (5 * countNumberOfDigits(lineNumber)));
                PdfVisualisationUtility.drawText(String.format("%d", lineNumber),
                        pdDocument,
                        new Point2D.Double(x, y + 2),
                        pageNumber,
                        PdfVisualisationUtility.Options.builder().stroke(true).build());
            }
        }
        try (var out = new FileOutputStream(tmpFileName)) {
            pdDocument.save(out);
        }
    }
}
/**
 * Counts the decimal digits of {@code num}; a sign is not counted and 0 has one digit.
 *
 * @param num the number to inspect
 * @return the number of decimal digits, at least 1
 */
private static int countNumberOfDigits(int num) {

    // every number has at least one digit; each further division by ten that leaves a rest adds one
    int digits = 1;
    for (int rest = num / 10; rest != 0; rest /= 10) {
        digits++;
    }
    return digits;
}
public static void drawDocumentGraph(PDDocument document, Document documentGraph) {
documentGraph.getDocumentTree().allEntriesInOrder().forEach(entry -> drawNode(document, entry));
@ -115,6 +182,35 @@ public class PdfDraw {
}
/**
 * Loads a PDF from the classpath, draws two sets of rectangles onto its pages and saves the
 * result. Both sets are drawn as stroked rectangles; {@code rectanglesPerPage} is drawn first,
 * {@code list} second.
 *
 * @param filename          classpath location of the PDF to draw on
 * @param list              second set of rectangles, one list per page in page order; must
 *                          contain an entry for every page of the document
 * @param rectanglesPerPage first set of rectangles, one list per page in page order; must
 *                          contain an entry for every page of the document
 * @param tmpFileName       path of the file the modified PDF is written to
 */
@SneakyThrows
public static void drawRectanglesAndLinesPerPage(String filename, List<List<Rectangle2D>> list, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) {

    // the document is a managed resource, so it is closed even when drawing or saving fails
    try (InputStream inputStream = new ClassPathResource(filename).getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream)) {
        int numberOfPages = pdDocument.getNumberOfPages();
        // page numbers are 1-based, the lists are 0-based
        for (int pageNumber = 1; pageNumber <= numberOfPages; pageNumber++) {
            PdfVisualisationUtility.drawRectangle2DList(pdDocument,
                    pageNumber,
                    rectanglesPerPage.get(pageNumber - 1),
                    PdfVisualisationUtility.Options.builder().stroke(true).build());
            // NOTE: despite the method name, "list" is drawn as rectangles, not as lines
            PdfVisualisationUtility.drawRectangle2DList(pdDocument,
                    pageNumber,
                    list.get(pageNumber - 1),
                    PdfVisualisationUtility.Options.builder().stroke(true).build());
        }
        try (var out = new FileOutputStream(tmpFileName)) {
            pdDocument.save(out);
        }
    }
}
@Builder
@AllArgsConstructor
@NoArgsConstructor

View File

@ -0,0 +1,37 @@
# Application configuration of the layout parser service processor.
info:
description: Layout Parser Service Processor
tenant-user-management-service.url: "http://tenant-user-management-service:8080/internal"
fforesight.tenants.remote: true
server:
port: 8080
spring:
main:
allow-circular-references: true # FIXME
# RabbitMQ connection: each value is taken from the RABBITMQ_* property and falls back to the default after the colon.
# NOTE(review): the fallback username/password are plain-text defaults - confirm they are only meant for local development.
rabbitmq:
host: ${RABBITMQ_HOST:localhost}
port: ${RABBITMQ_PORT:5672}
username: ${RABBITMQ_USERNAME:user}
password: ${RABBITMQ_PASSWORD:rabbitmq}
listener:
simple:
acknowledge-mode: AUTO
concurrency: 2
# Retry is enabled with at most 3 attempts.
retry:
enabled: true
max-attempts: 3
max-interval: 15000
prefetch: 1
# The metrics and prometheus endpoints follow monitoring.enabled (default: false); the health endpoint is always enabled.
management:
endpoint:
metrics.enabled: ${monitoring.enabled:false}
prometheus.enabled: ${monitoring.enabled:false}
health.enabled: true
# Only the prometheus and health endpoints are exposed over the web.
endpoints.web.exposure.include: prometheus, health
storage:
backend: 's3'

View File

@ -0,0 +1,16 @@
<Configuration>
<!-- Logging configuration: one console appender; the root logger is at "warn" and the "com.iqser" logger at "info". -->
<Appenders>
<!-- Writes to SYSTEM_OUT using the pattern: time, thread, level, logger name (precision 36), message. -->
<Console name="CONSOLE" target="SYSTEM_OUT">
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
</Console>
</Appenders>
<Loggers>
<!-- Level applied to every logger that has no more specific entry below. -->
<Root level="warn">
<AppenderRef ref="CONSOLE"/>
</Root>
<!-- NOTE(review): the Java sources in this change live in com.knecon.fforesight packages; confirm that "com.iqser" is the intended logger name. -->
<Logger name="com.iqser" level="info"/>
</Loggers>
</Configuration>

View File

@ -7,6 +7,7 @@
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>3.0.6</version>
<relativePath></relativePath>
</parent>
<groupId>com.knecon.fforesight</groupId>