TAAS-41/ RED-6725: integrate layoutparser into redactmanager
This commit is contained in:
parent
9c8501e76a
commit
241a32cb4f
@ -8,6 +8,7 @@ import lombok.Builder;
|
||||
|
||||
@Builder
|
||||
public record LayoutParsingRequest(
|
||||
LayoutParsingType layoutParsingType,
|
||||
Map<String, String> identifier,
|
||||
String originFileStorageId,
|
||||
Optional<String> tablesFileStorageId,
|
||||
@ -16,6 +17,7 @@ public record LayoutParsingRequest(
|
||||
String researchDocumentStorageId,
|
||||
String textBlockFileStorageId,
|
||||
String positionBlockFileStorageId,
|
||||
String pageFileStorageId) {
|
||||
String pageFileStorageId,
|
||||
String sectionGridStorageId) {
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
||||
|
||||
/**
 * Discriminator carried on a {@code LayoutParsingRequest} (its {@code layoutParsingType} component).
 *
 * <p>Within this change the layout parsing pipeline switches on this value to choose which
 * classification service classifies the parsed document (TAAS, DocuMine or RedactManager), and
 * only {@link #TAAS} requests additionally have research document data stored.
 */
public enum LayoutParsingType {
|
||||
REDACT_MANAGER,
|
||||
TAAS,
|
||||
DOCUMINE
|
||||
}
|
||||
@ -0,0 +1,16 @@
|
||||
<!--
  Log4j 2-style logging configuration: one console appender writing to SYSTEM_OUT,
  the root logger at WARN, and the "com.iqser" logger raised to INFO.
  NOTE(review): the Java sources touched by this change live under com.knecon.fforesight,
  which is not covered by the "com.iqser" logger and therefore logs at the root WARN level.
  Confirm that this is intended.
-->
<Configuration>
|
||||
|
||||
<Appenders>
|
||||
<Console name="CONSOLE" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
|
||||
</Console>
|
||||
</Appenders>
|
||||
|
||||
<Loggers>
|
||||
<Root level="warn">
|
||||
<AppenderRef ref="CONSOLE"/>
|
||||
</Root>
|
||||
<Logger name="com.iqser" level="info"/>
|
||||
</Loggers>
|
||||
|
||||
</Configuration>
|
||||
@ -60,6 +60,12 @@
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-amqp</artifactId>
|
||||
</dependency>
|
||||
<!--
  JUnit Jupiter, available on the test classpath only (scope "test").
  NOTE(review): RELEASE is a floating meta-version that resolves to whatever the newest
  release is at build time, so builds are not reproducible. Consider pinning an explicit
  version, or omitting the version if a parent/BOM already manages it (not visible here).
-->
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter</artifactId>
|
||||
<version>RELEASE</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
|
||||
@ -7,21 +7,24 @@ import java.io.IOException;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.service.ClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.mapper.taas.TaasDocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PdfParsingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionGridCreatorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -29,14 +32,17 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class LayoutParsingService {
|
||||
public class LayoutParsingPipeline {
|
||||
|
||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
private final CvTableParsingAdapter cvTableParsingAdapter;
|
||||
private final LayoutParsingStorageService layoutParsingStorageService;
|
||||
private final PdfParsingService pdfParsingService;
|
||||
private final ClassificationService classificationService;
|
||||
private final SectionsBuilderService sectionsBuilderService;
|
||||
private final SectionGridCreatorService sectionGridCreatorService;
|
||||
private final TaasClassificationService taasClassificationService;
|
||||
private final RedactManagerClassificationService redactManagerClassificationService;
|
||||
private final DocuMineClassificationService docuMineClassificationService;
|
||||
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
@ -54,13 +60,17 @@ public class LayoutParsingService {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.pageFileStorageId());
|
||||
}
|
||||
|
||||
Document documentGraph = parseLayout(originDocument, imageServiceResponse, tableServiceResponse);
|
||||
Document documentGraph = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
|
||||
int numberOfPages = originDocument.getNumberOfPages();
|
||||
originDocument.close();
|
||||
|
||||
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
|
||||
layoutParsingStorageService.storeSectionGrid(layoutParsingRequest, sectionGridCreatorService.createSectionGrid(documentGraph));
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
||||
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, researchDocumentData, DocumentDataMapper.toDocumentData(documentGraph));
|
||||
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) {
|
||||
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
|
||||
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
|
||||
}
|
||||
|
||||
return LayoutParsingFinishedEvent.builder()
|
||||
.identifier(layoutParsingRequest.identifier())
|
||||
@ -75,13 +85,21 @@ public class LayoutParsingService {
|
||||
}
|
||||
|
||||
|
||||
public Document parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {
|
||||
public Document parseLayout(LayoutParsingType layoutParsingType,
|
||||
PDDocument originDocument,
|
||||
ImageServiceResponse imageServiceResponse,
|
||||
TableServiceResponse tableServiceResponse) {
|
||||
|
||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument,
|
||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType,
|
||||
originDocument,
|
||||
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
|
||||
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
|
||||
|
||||
classificationService.classifyDocument(classificationDocument);
|
||||
switch (layoutParsingType) {
|
||||
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
|
||||
@ -89,16 +107,25 @@ public class LayoutParsingService {
|
||||
}
|
||||
|
||||
|
||||
public Document parseLayoutWithTimer(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {
|
||||
public Document parseLayoutWithTimer(LayoutParsingType layoutParsingType,
|
||||
PDDocument originDocument,
|
||||
ImageServiceResponse imageServiceResponse,
|
||||
TableServiceResponse tableServiceResponse) {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument,
|
||||
|
||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType, originDocument,
|
||||
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
|
||||
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
|
||||
|
||||
System.out.printf("parsed %d ms", System.currentTimeMillis() - start);
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
classificationService.classifyDocument(classificationDocument);
|
||||
switch (layoutParsingType) {
|
||||
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
System.out.printf(", classified %d ms", System.currentTimeMillis() - start);
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
@ -13,6 +13,7 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData;
|
||||
@ -68,14 +69,24 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
|
||||
|
||||
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData, DocumentData documentData) {
|
||||
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentTreeData());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getAtomicTextBlocks());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getAtomicPositionBlocks());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getPages());
|
||||
}
|
||||
|
||||
|
||||
/**
 * Hands the given section grid to {@code storageService.storeJSONObject} for the tenant returned
 * by {@code TenantContext.getTenantId()}, under the id
 * {@code layoutParsingRequest.sectionGridStorageId()}.
 *
 * @param layoutParsingRequest request supplying the target storage id
 * @param sectionGrid          object to store
 */
public void storeSectionGrid(LayoutParsingRequest layoutParsingRequest, SectionGrid sectionGrid) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.sectionGridStorageId(), sectionGrid);
|
||||
}
|
||||
|
||||
|
||||
/**
 * Hands the given research document data to {@code storageService.storeJSONObject} for the tenant
 * returned by {@code TenantContext.getTenantId()}, under the id
 * {@code layoutParsingRequest.researchDocumentStorageId()}.
 *
 * <p>In this change the pipeline calls this only for requests of type {@code TAAS}.
 *
 * @param layoutParsingRequest request supplying the target storage id
 * @param researchDocumentData object to store
 */
public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
|
||||
}
|
||||
|
||||
|
||||
@ -88,9 +99,7 @@ public class LayoutParsingStorageService {
|
||||
AtomicPositionBlockData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
AtomicPositionBlockData[].class);
|
||||
DocumentTreeData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
DocumentTreeData.class);
|
||||
DocumentTreeData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), DocumentTreeData.class);
|
||||
|
||||
return DocumentData.builder()
|
||||
.documentTreeData(tableOfContentsData)
|
||||
|
||||
@ -10,8 +10,8 @@ import java.util.Map;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@ -1,8 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
|
||||
public enum Orientation {
|
||||
|
||||
NONE,
|
||||
LEFT,
|
||||
RIGHT
|
||||
}
|
||||
@ -13,13 +13,13 @@ import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Footer;
|
||||
@ -81,8 +81,9 @@ public class DocumentGraphFactory {
|
||||
|
||||
page.getMainBody().add(node);
|
||||
|
||||
List<TextPageBlock> textBlocks = new ArrayList<>(textBlocksToMerge);
|
||||
List<TextPageBlock> textBlocks = new ArrayList<>();
|
||||
textBlocks.add(originalTextBlock);
|
||||
textBlocks.addAll(textBlocksToMerge);
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page);
|
||||
List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
|
||||
node.setLeafTextBlock(textBlock);
|
||||
|
||||
@ -7,9 +7,9 @@ import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@ -10,10 +10,10 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Section;
|
||||
@ -80,7 +80,7 @@ public class SectionNodeFactory {
|
||||
remainingBlocks.removeAll(alreadyMerged);
|
||||
|
||||
if (abstractPageBlock instanceof TextPageBlock) {
|
||||
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY(abstractPageBlock, remainingBlocks);
|
||||
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(textBlocks);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
|
||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||
@ -123,7 +123,7 @@ public class SectionNodeFactory {
|
||||
List<AbstractPageBlock> previousList = splitList.get(i - 1);
|
||||
AbstractPageBlock lastPageBlockInPreviousList = previousList.get(previousList.size() - 1);
|
||||
if (lastPageBlockInPreviousList.isHeadline()) {
|
||||
previousList.remove(i - 1);
|
||||
previousList.remove(previousList.size() - 1);
|
||||
splitList.get(i).add(0, lastPageBlockInPreviousList);
|
||||
}
|
||||
}
|
||||
@ -162,7 +162,7 @@ public class SectionNodeFactory {
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsY(AbstractPageBlock atc, List<AbstractPageBlock> pageBlocks) {
|
||||
private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsY(TextPageBlock atc, List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
return pageBlocks.stream()
|
||||
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
|
||||
@ -170,6 +170,7 @@ public class SectionNodeFactory {
|
||||
.filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock)
|
||||
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
|
||||
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
|
||||
.filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir())
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -7,10 +7,10 @@ import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||
|
||||
@ -2,7 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.factory;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
|
||||
|
||||
@ -5,6 +5,7 @@ import static java.lang.String.format;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Setter;
|
||||
@ -107,6 +108,10 @@ public class Boundary implements Comparable<Boundary> {
|
||||
return splitBoundaries;
|
||||
}
|
||||
|
||||
/**
 * Returns the integers covered by this boundary as a sequential stream, as produced by
 * {@code IntStream.range(start, end)}: {@code start} is included, {@code end} is excluded, and
 * the stream is empty when {@code start >= end}.
 *
 * @return stream of the values {@code start, start + 1, ..., end - 1}
 */
public IntStream intStream() {
|
||||
|
||||
return IntStream.range(start, end);
|
||||
}
|
||||
|
||||
public static Boundary merge(Collection<Boundary> boundaries) {
|
||||
|
||||
|
||||
@ -5,7 +5,7 @@ import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector;
|
||||
|
||||
@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.graph.entity.EntityT
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
public interface SemanticNode {
|
||||
|
||||
|
||||
@ -10,7 +10,7 @@ import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
public class PropertiesMapper {
|
||||
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -1,11 +1,11 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -1,8 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -1,8 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -1,11 +1,11 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
@ -1,10 +1,10 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
@ -0,0 +1,23 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
/**
 * Holder for two collections of rectangle lists, {@code xGaps} and {@code yGaps}.
 *
 * <p>Both fields are made {@code private final} by {@code @FieldDefaults}; getters come from
 * {@code @Getter}, and {@code @AllArgsConstructor} adds a constructor taking both lists. The
 * lists themselves are not copied or wrapped, so they stay mutable through the getters.
 *
 * <p>NOTE(review): what the outer and inner list levels represent is not visible in this file —
 * TODO confirm and document.
 */
@Getter
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class Gaps {
|
||||
List<List<Rectangle2D>> xGaps ;
|
||||
List<List<Rectangle2D>> yGaps ;
|
||||
|
||||
/** Creates an instance whose {@code xGaps} and {@code yGaps} are new, empty linked lists. */
public Gaps() {
|
||||
xGaps = new LinkedList<>();
|
||||
yGaps = new LinkedList<>();
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
/**
 * Class with no fields or methods in this revision.
 *
 * <p>NOTE(review): looks like a placeholder for upcoming work — confirm it is needed, or remove it.
 */
public class LineInformation {
|
||||
|
||||
}
|
||||
@ -0,0 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
/**
 * Three-valued orientation marker: {@link #NONE}, {@link #LEFT} or {@link #RIGHT}.
 *
 * <p>Same constants as the enum removed from the {@code processor.classification.model} package
 * in this change; only the package differs.
 *
 * <p>NOTE(review): what is being oriented is not visible here — TODO document at the usage site.
 */
public enum Orientation {
|
||||
|
||||
NONE,
|
||||
LEFT,
|
||||
RIGHT
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
public enum PageBlockType {
|
||||
H1,
|
||||
@ -0,0 +1,20 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
|
||||
/**
 * Value holder pairing a list of text position sequences with a crop box rectangle.
 *
 * <p>Lombok supplies getters, a builder, and a constructor taking both fields.
 *
 * <p>NOTE(review): the fields carry no access modifier and are not final, so they are
 * package-private and mutable, unlike {@code Gaps} in the same package, which uses
 * {@code @FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)}. Confirm this is intended.
 */
@Getter
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
public class PageInformation {
|
||||
|
||||
// Presumably already sorted by the producer, as the name suggests; nothing here enforces an order.
List<TextPositionSequence> sortedTextPositionSequences;
|
||||
Rectangle2D cropBox;
|
||||
|
||||
}
|
||||
@ -0,0 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
/**
 * Class with no fields or methods in this revision.
 *
 * <p>NOTE(review): this change also shows a {@code PageInformation} with fields declared in the
 * same package ({@code processor.model}). Two top-level classes with the same name cannot share a
 * package, so verify which file this empty declaration belongs to.
 */
public class PageInformation {
|
||||
|
||||
}
|
||||
@ -0,0 +1,123 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class SectionIdentifier {
|
||||
|
||||
static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
|
||||
|
||||
private enum Format {
|
||||
EMPTY,
|
||||
NUMERICAL,
|
||||
DOCUMENT
|
||||
}
|
||||
|
||||
Format format;
|
||||
String identifierString;
|
||||
List<Integer> identifiers;
|
||||
boolean asChild;
|
||||
|
||||
|
||||
public static SectionIdentifier fromSearchText(String headline) {
|
||||
|
||||
if (headline == null || headline.isEmpty() || headline.isBlank()) {
|
||||
return SectionIdentifier.empty();
|
||||
}
|
||||
|
||||
Matcher numericalIdentifierMatcher = numericalIdentifierPattern.matcher(headline);
|
||||
if (numericalIdentifierMatcher.find()) {
|
||||
return buildNumericalSectionIdentifier(headline, numericalIdentifierMatcher);
|
||||
}
|
||||
// more formats here
|
||||
return SectionIdentifier.empty();
|
||||
}
|
||||
|
||||
|
||||
public static SectionIdentifier asChildOf(SectionIdentifier sectionIdentifier) {
|
||||
|
||||
return new SectionIdentifier(sectionIdentifier.format, sectionIdentifier.toString(), sectionIdentifier.identifiers, true);
|
||||
}
|
||||
|
||||
|
||||
public static SectionIdentifier document() {
|
||||
|
||||
return new SectionIdentifier(Format.DOCUMENT, "document", Collections.emptyList(), false);
|
||||
}
|
||||
|
||||
|
||||
public static SectionIdentifier empty() {
|
||||
|
||||
return new SectionIdentifier(Format.EMPTY, "empty", Collections.emptyList(), false);
|
||||
}
|
||||
|
||||
|
||||
private static SectionIdentifier buildNumericalSectionIdentifier(String headline, Matcher numericalIdentifierMatcher) {
|
||||
|
||||
String identifierString = headline.substring(numericalIdentifierMatcher.start(), numericalIdentifierMatcher.end());
|
||||
List<Integer> identifiers = new LinkedList<>();
|
||||
for (int i = 1; i <= 4; i++) {
|
||||
String numericalIdentifier = numericalIdentifierMatcher.group(i);
|
||||
if (numericalIdentifier == null || numericalIdentifier.equals("0") || numericalIdentifier.isEmpty() || numericalIdentifier.isBlank()) {
|
||||
break;
|
||||
}
|
||||
identifiers.add(Integer.parseInt(numericalIdentifier.trim()));
|
||||
}
|
||||
return new SectionIdentifier(Format.NUMERICAL, identifierString, identifiers.stream().toList(), false);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Determines if the current section is the parent of the given section.
|
||||
*
|
||||
* @param sectionIdentifier The section identifier to compare against.
|
||||
* @return true if the current section is the parent of the given section, false otherwise.
|
||||
*/
|
||||
public boolean isParentOf(SectionIdentifier sectionIdentifier) {
|
||||
|
||||
if (this.format.equals(Format.EMPTY)) {
|
||||
return false;
|
||||
}
|
||||
if (this.format.equals(Format.DOCUMENT)) {
|
||||
return true;
|
||||
}
|
||||
if (!this.format.equals(sectionIdentifier.format)) {
|
||||
return false;
|
||||
}
|
||||
if (this.identifiers.size() >= sectionIdentifier.identifiers.size() && !(this.identifiers.size() == sectionIdentifier.identifiers.size() && sectionIdentifier.asChild)) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < this.identifiers.size(); i++) {
|
||||
if (!this.identifiers.get(i).equals(sectionIdentifier.identifiers.get(i))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public boolean isChildOf(SectionIdentifier sectionIdentifier) {
|
||||
|
||||
if (this.format.equals(Format.DOCUMENT) || this.format.equals(Format.EMPTY)) {
|
||||
return false;
|
||||
}
|
||||
return sectionIdentifier.isParentOf(this);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return identifierString;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.image;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.image;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.Value;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
@ -11,8 +11,8 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.CohenSutherlandClipping;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
@ -8,9 +8,9 @@ import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.springframework.beans.BeanUtils;
|
||||
@ -1,8 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonCreator;
|
||||
import com.fasterxml.jackson.annotation.JsonValue;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
@ -7,11 +7,11 @@ import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.parsing;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.parsing;
|
||||
|
||||
import java.awt.color.CMMException;
|
||||
import java.awt.geom.Point2D;
|
||||
@ -35,9 +35,9 @@ import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
@ -14,7 +14,7 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.parsing;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
@ -10,11 +10,10 @@ import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
@ -25,7 +24,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class MessageHandler {
|
||||
|
||||
private final LayoutParsingService layoutParsingService;
|
||||
private final LayoutParsingPipeline layoutParsingPipeline;
|
||||
private final ObjectMapper objectMapper;
|
||||
private final RabbitTemplate rabbitTemplate;
|
||||
|
||||
@ -42,7 +41,7 @@ public class MessageHandler {
|
||||
throw new AmqpRejectAndDontRequeueException(String.format("Error during last layout parsing of request with identifier: %s, do not retry.",
|
||||
layoutParsingRequest.identifier()));
|
||||
}
|
||||
LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingService.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
sendLayoutParsingFinishedEvent(layoutParsingFinishedEvent);
|
||||
log.info("Layout parsing finished {} in {} ms", layoutParsingRequest.identifier(), layoutParsingFinishedEvent.duration());
|
||||
}
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@ -6,13 +6,13 @@ import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
@Service
|
||||
public class BodyTextFrameService {
|
||||
@ -0,0 +1,149 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class DividingColumnDetectionService {
|
||||
|
||||
private static final double SPLITTABLE_LINE_PERCENTAGE_THRESHOLD = 0.6;
|
||||
private static final int MAX_NUMBER_OF_COLUMNS = 4;
|
||||
|
||||
|
||||
public List<Rectangle2D> detectColumns(List<TextPositionSequence> textPositionSequences, Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
if (textPositionSequences.size() < 2) {
|
||||
return List.of(mainBodyTextFrame);
|
||||
}
|
||||
|
||||
List<List<Rectangle2D>> linesWithGaps = LineDetectionService.findTextBlockInLines(textPositionSequences);
|
||||
|
||||
Map<Integer, List<Integer>> linesWithMatchingGapIndices = new HashMap<>();
|
||||
for (int numberOfColumns = 2; numberOfColumns <= MAX_NUMBER_OF_COLUMNS; numberOfColumns++) {
|
||||
linesWithMatchingGapIndices.put(numberOfColumns, findConsecutiveLinesWithMatchingGaps(linesWithGaps, mainBodyTextFrame.getWidth(), numberOfColumns));
|
||||
}
|
||||
|
||||
int optimalNumberOfColumns = findOptimalNumberOfColumns(linesWithMatchingGapIndices, linesWithGaps.size());
|
||||
if (optimalNumberOfColumns == 1) {
|
||||
return List.of(mainBodyTextFrame);
|
||||
}
|
||||
return buildColumns(mainBodyTextFrame, getLinesWithMatchingGaps(linesWithMatchingGapIndices.get(optimalNumberOfColumns), linesWithGaps), optimalNumberOfColumns);
|
||||
}
|
||||
|
||||
|
||||
private static List<Integer> findConsecutiveLinesWithMatchingGaps(List<List<Rectangle2D>> linesWithGaps, double width, int numberOfColumns) {
|
||||
|
||||
List<Boolean> booleans = lineHasMatchingGap(linesWithGaps, width, numberOfColumns);
|
||||
return findConsecutiveTrueIndicesWithMaxLengthRun(booleans);
|
||||
}
|
||||
|
||||
|
||||
private List<Boolean> lineHasMatchingGap(List<List<Rectangle2D>> linesWithGaps, double width, int numberOfColumns) {
|
||||
|
||||
return linesWithGaps.stream()
|
||||
.map(blocksWithGaps -> IntStream.range(1, numberOfColumns)
|
||||
.allMatch(columnIndex -> noBlocksIntersectX(blocksWithGaps, calculateGapLocation(width, numberOfColumns, columnIndex))))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private List<Integer> findConsecutiveTrueIndicesWithMaxLengthRun(List<Boolean> booleans) {
|
||||
|
||||
List<Integer> maxConsecutiveTrueIndices = new LinkedList<>();
|
||||
List<Integer> currentConsecutiveTrueIndices = new LinkedList<>();
|
||||
for (int i = 0; i < booleans.size(); i++) {
|
||||
if (!booleans.get(i)) {
|
||||
if (currentConsecutiveTrueIndices.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) {
|
||||
maxConsecutiveTrueIndices = currentConsecutiveTrueIndices;
|
||||
}
|
||||
currentConsecutiveTrueIndices = new LinkedList<>();
|
||||
continue;
|
||||
}
|
||||
currentConsecutiveTrueIndices.add(i);
|
||||
}
|
||||
if (currentConsecutiveTrueIndices.size() > maxConsecutiveTrueIndices.size()) {
|
||||
return currentConsecutiveTrueIndices;
|
||||
}
|
||||
return maxConsecutiveTrueIndices;
|
||||
}
|
||||
|
||||
|
||||
private static int findOptimalNumberOfColumns(Map<Integer, List<Integer>> linesWithMatchingGapIndices, Integer numberOfLines) {
|
||||
|
||||
return linesWithMatchingGapIndices.entrySet()
|
||||
.stream()
|
||||
.max(comparePercentages(numberOfLines))
|
||||
.filter(entry -> percentageIsAboveThreshold(entry, numberOfLines))
|
||||
.map(Map.Entry::getKey)
|
||||
.orElse(1);
|
||||
}
|
||||
|
||||
|
||||
private List<Rectangle2D> buildColumns(Rectangle2D mainBodyTextFrame, List<Rectangle2D> rectanglesToMerge, int optimalColumnCount) {
|
||||
|
||||
if (optimalColumnCount == 1 || rectanglesToMerge.isEmpty()) {
|
||||
return List.of(mainBodyTextFrame);
|
||||
}
|
||||
|
||||
double maxY = rectanglesToMerge.get(0).getMaxY();
|
||||
double minY = rectanglesToMerge.get(rectanglesToMerge.size() - 1).getMinY();
|
||||
|
||||
List<Rectangle2D> columns = new LinkedList<>();
|
||||
double width = mainBodyTextFrame.getWidth() / optimalColumnCount;
|
||||
double height = maxY - minY;
|
||||
for (int i = 0; i < optimalColumnCount; i++) {
|
||||
columns.add(new Rectangle2D.Double(mainBodyTextFrame.getMinY() + i * width, minY, width, height));
|
||||
}
|
||||
return columns;
|
||||
}
|
||||
|
||||
|
||||
private Comparator<Map.Entry<Integer, List<Integer>>> comparePercentages(Integer numberOfLines) {
|
||||
|
||||
return Comparator.comparingDouble(entry -> calculatePercentage(entry.getValue().size(), numberOfLines));
|
||||
}
|
||||
|
||||
|
||||
private List<Rectangle2D> getLinesWithMatchingGaps(List<Integer> linesWithMatchingGapIndices, List<List<Rectangle2D>> linesWithGaps) {
|
||||
|
||||
return linesWithMatchingGapIndices.stream().map(linesWithGaps::get).flatMap(Collection::stream).toList();
|
||||
}
|
||||
|
||||
|
||||
private boolean percentageIsAboveThreshold(Map.Entry<Integer, List<Integer>> entry, Integer numberOfLines) {
|
||||
|
||||
return calculatePercentage(entry.getValue().size(), numberOfLines) > SPLITTABLE_LINE_PERCENTAGE_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private double calculatePercentage(Integer numberOfMatchingLines, Integer numberOfLines) {
|
||||
|
||||
return ((double) numberOfMatchingLines) / ((double) numberOfLines);
|
||||
}
|
||||
|
||||
|
||||
private double calculateGapLocation(double pageWidth, int numberOfColumns, int columnIndex) {
|
||||
|
||||
return (pageWidth / numberOfColumns) * columnIndex;
|
||||
}
|
||||
|
||||
|
||||
private Boolean noBlocksIntersectX(List<Rectangle2D> blocksWithGaps, double x) {
|
||||
|
||||
return blocksWithGaps.stream().noneMatch(rect -> rect.getMaxX() > x && rect.getMinX() < x);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,169 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class GapDetectionService {
|
||||
|
||||
private static final double X_GAP_FACTOR = 0.3; // multiplied with average text height, determines the minimum distance of gaps in lines
|
||||
private static final double Y_GAP_FACTOR = 1;
|
||||
private static final double NEW_LINE_FACTOR = 0.2;
|
||||
|
||||
|
||||
public static Gaps findGapsInLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
if (sortedTextPositionSequences.isEmpty()) {
|
||||
return new Gaps();
|
||||
}
|
||||
//assertAllTextPositionsHaveSameDir(textPositionSequences);
|
||||
|
||||
final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences);
|
||||
|
||||
XGapsContext xGapContext = XGapsContext.init(mainBodyTextFrame);
|
||||
YGapsContext yGapContext = YGapsContext.init(mainBodyTextFrame);
|
||||
|
||||
var previousTextPosition = sortedTextPositionSequences.get(0);
|
||||
Rectangle2D rectangle = toRectangle2D(previousTextPosition);
|
||||
|
||||
yGapContext.addGapFromTopOfMainBody(rectangle);
|
||||
xGapContext.addGapFromLeftEdgeOfMainBody(rectangle);
|
||||
|
||||
for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) {
|
||||
|
||||
double yGap = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj());
|
||||
double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj());
|
||||
Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition);
|
||||
Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition);
|
||||
|
||||
if (yGap > avgTextPositionHeight * Y_GAP_FACTOR) {
|
||||
|
||||
yGapContext.addGap(mainBodyTextFrame.getMinX(), currentTextPositionBBox.getMaxY(), mainBodyTextFrame.getWidth(), yGap);
|
||||
|
||||
}
|
||||
if (yGap > avgTextPositionHeight * NEW_LINE_FACTOR) {
|
||||
|
||||
xGapContext.addGapToRightEdgeOfMainBody(previousTextPositionBBox);
|
||||
xGapContext.gapsInCurrentLine = new LinkedList<>();
|
||||
xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
|
||||
xGapContext.addGapFromLeftEdgeOfMainBody(currentTextPositionBBox);
|
||||
|
||||
|
||||
} else if (xGap <= avgTextPositionHeight * X_GAP_FACTOR) {
|
||||
addGapToLine(currentTextPositionBBox, previousTextPositionBBox, xGapContext);
|
||||
}
|
||||
previousTextPosition = currentTextPosition;
|
||||
}
|
||||
xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedTextPositionSequences.get(sortedTextPositionSequences.size() - 1)));
|
||||
xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
|
||||
|
||||
return new Gaps(xGapContext.gapsPerLine, yGapContext.gapsPerLine);
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) {
|
||||
|
||||
return RectangleTransformations.toRectangle2D(textPosition.getRectangle());
|
||||
}
|
||||
|
||||
|
||||
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
|
||||
|
||||
context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(),
|
||||
previousTextPosition.getMinY(),
|
||||
currentTextPosition.getMinX() - previousTextPosition.getMaxX(),
|
||||
(previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2));
|
||||
}
|
||||
|
||||
|
||||
private static void assertAllTextPositionsHaveSameDir(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
assert textPositionSequences.stream().map(TextPositionSequence::getDir).allMatch(a -> a.equals(textPositionSequences.get(0).getDir()));
|
||||
}
|
||||
|
||||
|
||||
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
|
||||
}
|
||||
|
||||
|
||||
@AllArgsConstructor
|
||||
private static class YGapsContext {
|
||||
|
||||
List<List<Rectangle2D>> gapsPerLine;
|
||||
List<Rectangle2D> gapsInCurrentLine;
|
||||
Rectangle2D mainBodyTextFrame;
|
||||
|
||||
|
||||
public static YGapsContext init(Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
List<List<Rectangle2D>> initialLinesWithGaps = new LinkedList<>();
|
||||
List<Rectangle2D> initialBlocksInLine = new LinkedList<>();
|
||||
initialLinesWithGaps.add(initialBlocksInLine);
|
||||
return new YGapsContext(initialLinesWithGaps, initialBlocksInLine, mainBodyTextFrame);
|
||||
}
|
||||
|
||||
|
||||
public void addGapFromTopOfMainBody(Rectangle2D rectangle) {
|
||||
|
||||
gapsInCurrentLine.add(new Rectangle2D.Double(mainBodyTextFrame.getMinX(),
|
||||
rectangle.getMaxY(),
|
||||
mainBodyTextFrame.getWidth(),
|
||||
mainBodyTextFrame.getMaxY() - rectangle.getMaxY()));
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void addGap(double x, double y, double w, double h) {
|
||||
|
||||
gapsInCurrentLine.add(new Rectangle2D.Double(x, y, w, h));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@AllArgsConstructor
|
||||
private static class XGapsContext {
|
||||
|
||||
List<List<Rectangle2D>> gapsPerLine;
|
||||
List<Rectangle2D> gapsInCurrentLine;
|
||||
Rectangle2D mainBodyTextFrame;
|
||||
|
||||
|
||||
public static XGapsContext init(Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
List<List<Rectangle2D>> initialLinesWithGaps = new LinkedList<>();
|
||||
List<Rectangle2D> initialBlocksInLine = new LinkedList<>();
|
||||
initialLinesWithGaps.add(initialBlocksInLine);
|
||||
return new XGapsContext(initialLinesWithGaps, initialBlocksInLine, mainBodyTextFrame);
|
||||
}
|
||||
|
||||
|
||||
public void addGapToRightEdgeOfMainBody(Rectangle2D textPosition) {
|
||||
|
||||
Rectangle2D leftGap = new Rectangle2D.Double(textPosition.getMaxX(),
|
||||
textPosition.getMinY(),
|
||||
mainBodyTextFrame.getMaxX() - textPosition.getMaxX(),
|
||||
textPosition.getHeight());
|
||||
gapsInCurrentLine.add(leftGap);
|
||||
}
|
||||
|
||||
|
||||
public void addGapFromLeftEdgeOfMainBody(Rectangle2D textPosition) {
|
||||
|
||||
Rectangle2D leftGap = new Rectangle2D.Double(mainBodyTextFrame.getMinX(),
|
||||
textPosition.getMinY(),
|
||||
textPosition.getMinX() - mainBodyTextFrame.getMinX(),
|
||||
textPosition.getHeight());
|
||||
gapsInCurrentLine.add(leftGap);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,199 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Queue;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class GapFindingColumnDetectionService implements ColumnDetectionService {
|
||||
|
||||
private static final double GAP_WIDTH_THRESHOLD_FACTOR = 0.01; // multiplied with avg text height
|
||||
private static final double LINE_COUNT_THRESHOLD_FACTOR = 0.3; // multiplied with average line count per page
|
||||
|
||||
|
||||
public List<Rectangle2D> detectColumns(GapInformation gapInformation, Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
if (gapInformation.getXGaps().size() < 2) {
|
||||
return List.of(mainBodyTextFrame);
|
||||
}
|
||||
double avgHeight = gapInformation.getXGaps()
|
||||
.stream()
|
||||
.filter(gaps -> !gaps.isEmpty())
|
||||
.map(gaps -> gaps.get(0))
|
||||
.mapToDouble(RectangularShape::getHeight)
|
||||
.average()
|
||||
.orElseThrow();
|
||||
|
||||
ColumnFactory columnFactory = ColumnFactory.init(avgHeight, gapInformation.getXGaps().size());
|
||||
gapInformation.getXGaps().get(0).stream().map(Column::new).forEach(columnFactory::addToQueue);
|
||||
List<List<Rectangle2D>> xGaps = gapInformation.getXGaps();
|
||||
for (var gaps : xGaps.subList(1, xGaps.size())) {
|
||||
|
||||
while (columnFactory.hasColumnsToProcess()) {
|
||||
Column column = columnFactory.getNext();
|
||||
rememberColumnIfValid(columnFactory, column);
|
||||
elongateColumnsAndFilterForWidth(column, gaps, columnFactory).forEach(columnFactory::setToStillInProgress);
|
||||
}
|
||||
columnFactory.addStillInProgressToQueue();
|
||||
columnFactory.addGapsToQueue(gaps);
|
||||
}
|
||||
|
||||
return columnFactory.outputColumns.stream()
|
||||
.filter(column -> columnFactory.outputColumns.stream().filter(column::intersectsX).noneMatch(column1 -> column1.lineCount > column.lineCount))
|
||||
.filter(column -> )
|
||||
.map(Column::getRectangle2D)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private static void rememberColumnIfValid(ColumnFactory columnFactory, Column column) {
|
||||
|
||||
if (column.lineCount >= LINE_COUNT_THRESHOLD_FACTOR * (double) columnFactory.lineCount) {
|
||||
columnFactory.outputColumns.add(column);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static Stream<Column> elongateColumnsAndFilterForWidth(Column column, List<Rectangle2D> gaps, ColumnFactory columnFactory) {
|
||||
|
||||
return gaps.stream()//
|
||||
.filter(gap -> column.getIntersectionWidth(gap) > GAP_WIDTH_THRESHOLD_FACTOR * columnFactory.avgHeight)//
|
||||
.map(column::addNewLineAndShrink);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle2D correctRectangle(Rectangle2D rectangle2D) {
|
||||
|
||||
double minX = Math.min(rectangle2D.getMinX(), rectangle2D.getMaxX());
|
||||
double minY = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY());
|
||||
double maxX = Math.max(rectangle2D.getMinX(), rectangle2D.getMaxX());
|
||||
double maxY = Math.max(rectangle2D.getMinY(), rectangle2D.getMaxY());
|
||||
return new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY);
|
||||
}
|
||||
|
||||
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
private class Column {
|
||||
|
||||
Rectangle2D rectangle2D;
|
||||
int lineCount = 1;
|
||||
|
||||
|
||||
public Column(Rectangle2D rectangle2D) {
|
||||
|
||||
this.rectangle2D = correctRectangle(rectangle2D);
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsX(Rectangle2D rectangle2D) {
|
||||
|
||||
return rectangle2D.getMinX() < this.rectangle2D.getMaxX() && this.rectangle2D.getMinX() < rectangle2D.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsX(Column column) {
|
||||
|
||||
return this.intersectsX(column.getRectangle2D());
|
||||
}
|
||||
|
||||
|
||||
public double getIntersectionWidth(Rectangle2D rectangle2D) {
|
||||
|
||||
if (!intersectsX(rectangle2D)) {
|
||||
return -1;
|
||||
}
|
||||
double min_x = Math.max(rectangle2D.getMinX(), this.rectangle2D.getMinX());
|
||||
double max_x = Math.min(rectangle2D.getMaxX(), this.rectangle2D.getMaxX());
|
||||
return max_x - min_x;
|
||||
}
|
||||
|
||||
|
||||
public Column addNewLineAndShrink(Rectangle2D rectangle2D) {
|
||||
|
||||
var correctedRectangle = correctRectangle(rectangle2D);
|
||||
double min_x = Math.max(correctedRectangle.getMinX(), this.rectangle2D.getMinX());
|
||||
double max_x = Math.min(correctedRectangle.getMaxX(), this.rectangle2D.getMaxX());
|
||||
double min_y = correctedRectangle.getMinY();
|
||||
double max_y = this.rectangle2D.getMaxY();
|
||||
double width = max_x - min_x;
|
||||
double height = max_y - min_y;
|
||||
return new Column(new Rectangle2D.Double(min_x, min_y, width, height), lineCount + 1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@RequiredArgsConstructor
|
||||
private class ColumnFactory {
|
||||
|
||||
final double avgHeight;
|
||||
final int lineCount;
|
||||
|
||||
List<Column> outputColumns = new LinkedList<>();
|
||||
Queue<Column> columnQueue = new LinkedList<>();
|
||||
List<Column> columnsToQueue = new LinkedList<>();
|
||||
|
||||
|
||||
public static ColumnFactory init(double avgHeight, int lineCount) {
|
||||
|
||||
return new ColumnFactory(Math.abs(avgHeight), lineCount);
|
||||
}
|
||||
|
||||
|
||||
public Column getNext() {
|
||||
|
||||
return columnQueue.remove();
|
||||
}
|
||||
|
||||
|
||||
public void addToQueue(Column column) {
|
||||
|
||||
columnQueue.add(column);
|
||||
}
|
||||
|
||||
|
||||
public void addToQueue(Rectangle2D gap) {
|
||||
|
||||
columnQueue.add(new Column(gap));
|
||||
}
|
||||
|
||||
|
||||
private boolean hasColumnsToProcess() {
|
||||
|
||||
return columnQueue.peek() != null;
|
||||
}
|
||||
|
||||
|
||||
public void setToStillInProgress(Column column) {
|
||||
|
||||
columnsToQueue.add(column);
|
||||
}
|
||||
|
||||
|
||||
private void addStillInProgressToQueue() {
|
||||
|
||||
for (int i = columnsToQueue.size() - 1; i >= 0; i--) {
|
||||
columnQueue.add(columnsToQueue.remove(i));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void addGapsToQueue(List<Rectangle2D> gaps) {
|
||||
|
||||
gaps.forEach(this::addToQueue);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
/**
 * Placeholder type; it currently declares no fields or methods.
 */
public class InvisibleTableDetectionService {

}
|
||||
@ -0,0 +1,122 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class LineDetectionService {
|
||||
|
||||
private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines
|
||||
|
||||
|
||||
public static List<List<Rectangle2D>> findTextBlockInLines(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
if (textPositionSequences.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
final double avgTextPositionHeight = getAvgTextPositionHeight(textPositionSequences);
|
||||
|
||||
TextBlockContext context = TextBlockContext.init();
|
||||
|
||||
List<TextPositionSequence> sortedTextPositionSequence = textPositionSequences.stream().sorted(new TextPositionSequenceComparator()).toList();
|
||||
|
||||
var previousTextPosition = sortedTextPositionSequence.get(0);
|
||||
context.textPositionsToMerge.add(previousTextPosition);
|
||||
for (TextPositionSequence currentTextPosition : sortedTextPositionSequence.subList(1, sortedTextPositionSequence.size())) {
|
||||
if (isNewLine(currentTextPosition, previousTextPosition, avgTextPositionHeight) || isSplitByOrientation(currentTextPosition, previousTextPosition)) {
|
||||
addBlockToLine(context);
|
||||
startNewLine(currentTextPosition, context);
|
||||
} else if (isXGap(currentTextPosition, previousTextPosition, avgTextPositionHeight)) {
|
||||
addBlockToLine(context);
|
||||
startNewBlock(currentTextPosition, context);
|
||||
} else {
|
||||
context.textPositionsToMerge.add(currentTextPosition);
|
||||
}
|
||||
previousTextPosition = currentTextPosition;
|
||||
}
|
||||
addBlockToLine(context);
|
||||
return context.textBlocksInLines;
|
||||
}
|
||||
|
||||
|
||||
|
||||
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
|
||||
}
|
||||
|
||||
|
||||
private static boolean isXGap(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) {
|
||||
|
||||
return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR);
|
||||
}
|
||||
|
||||
|
||||
private static boolean isSplitByOrientation(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition) {
|
||||
|
||||
return !previousTextPosition.getDir().equals(currentTextPosition.getDir());
|
||||
}
|
||||
|
||||
|
||||
private static boolean isNewLine(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) {
|
||||
|
||||
return Math.abs(previousTextPosition.getMinYDirAdj() - currentTextPosition.getMinYDirAdj()) > avgTextPositionHeight;
|
||||
}
|
||||
|
||||
|
||||
private static void startNewBlock(TextPositionSequence currentTextPosition, TextBlockContext context) {
|
||||
|
||||
context.textPositionsToMerge = new LinkedList<>();
|
||||
context.textPositionsToMerge.add(currentTextPosition);
|
||||
}
|
||||
|
||||
|
||||
private static void addBlockToLine(TextBlockContext context) {
|
||||
|
||||
context.blocksInCurrentLine.add(textPositionBBox(context.textPositionsToMerge));
|
||||
}
|
||||
|
||||
|
||||
private static void startNewLine(TextPositionSequence current, TextBlockContext context) {
|
||||
|
||||
context.blocksInCurrentLine = new LinkedList<>();
|
||||
startNewBlock(current, context);
|
||||
context.textBlocksInLines.add(context.blocksInCurrentLine);
|
||||
}
|
||||
|
||||
|
||||
private Rectangle2D textPositionBBox(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
return RectangleTransformations.rectangleBBox(textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList());
|
||||
}
|
||||
|
||||
|
||||
@AllArgsConstructor
|
||||
private class TextBlockContext {
|
||||
|
||||
List<List<Rectangle2D>> textBlocksInLines;
|
||||
List<Rectangle2D> blocksInCurrentLine;
|
||||
List<TextPositionSequence> textPositionsToMerge;
|
||||
|
||||
|
||||
public static TextBlockContext init() {
|
||||
|
||||
List<List<Rectangle2D>> initialLinesWithGaps = new LinkedList<>();
|
||||
List<Rectangle2D> initialBlocksInLine = new LinkedList<>();
|
||||
initialLinesWithGaps.add(initialBlocksInLine);
|
||||
return new TextBlockContext(initialLinesWithGaps, initialBlocksInLine, new LinkedList<>());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -0,0 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
/**
 * Placeholder type; it currently declares no fields or methods.
 */
public class MainBodyTextFrameExtractionService {

}
|
||||
@ -0,0 +1,2 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;

/**
 * Placeholder type; it currently declares no fields or methods.
 */
public class PageInformationService {

}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@ -9,16 +9,20 @@ import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
@ -31,11 +35,16 @@ public class PdfParsingService {
|
||||
|
||||
private final RulingCleaningService rulingCleaningService;
|
||||
private final TableExtractionService tableExtractionService;
|
||||
private final BlockificationService blockificationService;
|
||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
private final TaasBlockificationService taasBlockificationService;
|
||||
private final DocuMineBlockificationService docuMineBlockificationService;
|
||||
private final RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
|
||||
|
||||
public ClassificationDocument parseDocument(PDDocument originDocument, Map<Integer, List<TableCells>> pdfTableCells, Map<Integer, List<ClassifiedImage>> pdfImages) {
|
||||
public ClassificationDocument parseDocument(LayoutParsingType layoutParsingType,
|
||||
PDDocument originDocument,
|
||||
Map<Integer, List<TableCells>> pdfTableCells,
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages) {
|
||||
|
||||
ClassificationDocument document = new ClassificationDocument();
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
@ -44,7 +53,7 @@ public class PdfParsingService {
|
||||
long pageCount = originDocument.getNumberOfPages();
|
||||
|
||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||
parsePage(pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber);
|
||||
parsePage(layoutParsingType, pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber);
|
||||
}
|
||||
|
||||
document.setPages(classificationPages);
|
||||
@ -54,7 +63,8 @@ public class PdfParsingService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void parsePage(Map<Integer, List<ClassifiedImage>> pdfImages,
|
||||
private void parsePage(LayoutParsingType layoutParsingType,
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages,
|
||||
PDDocument pdDocument,
|
||||
Map<Integer, List<TableCells>> pdfTableCells,
|
||||
ClassificationDocument document,
|
||||
@ -79,7 +89,12 @@ public class PdfParsingService {
|
||||
stripper.getRulings(),
|
||||
stripper.getMinCharWidth(),
|
||||
stripper.getMaxCharHeight());
|
||||
ClassificationPage classificationPage = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
};
|
||||
|
||||
classificationPage.setRotation(rotation);
|
||||
classificationPage.setLandscape(isLandscape);
|
||||
@ -1,95 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Area;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.BinaryOperator;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collector;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class RectangleTransformations {
|
||||
|
||||
public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY);
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
|
||||
|
||||
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DUnion());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D rectangleUnion(List<Rectangle2D> rectangle2DList) {
|
||||
|
||||
return rectangle2DList.stream().collect(new Rectangle2DUnion());
|
||||
}
|
||||
|
||||
|
||||
public static String toString(Rectangle2D rectangle2D) {
|
||||
|
||||
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D parseRectangle2D(String bBox) {
|
||||
|
||||
List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
|
||||
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
||||
}
|
||||
|
||||
|
||||
private static class Rectangle2DUnion implements Collector<Rectangle2D, Area, Rectangle2D> {
|
||||
|
||||
@Override
|
||||
public Supplier<Area> supplier() {
|
||||
|
||||
return Area::new;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BiConsumer<Area, Rectangle2D> accumulator() {
|
||||
|
||||
return (area, rectangle2D) -> area.add(new Area(rectangle2D));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BinaryOperator<Area> combiner() {
|
||||
|
||||
return (area1, area2) -> {
|
||||
area1.add(area2);
|
||||
return area1;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Function<Area, Rectangle2D> finisher() {
|
||||
|
||||
return Area::getBounds2D;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<Characteristics> characteristics() {
|
||||
|
||||
return Set.of(Characteristics.CONCURRENT, Characteristics.UNORDERED);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
@ -13,9 +13,9 @@ import java.util.Map;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -0,0 +1,146 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.BinaryOperator;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collector;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.CellRectangle;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionRectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class SectionGridCreatorService {
|
||||
|
||||
public SectionGrid createSectionGrid(Document document) {
|
||||
|
||||
Map<Integer, List<SectionRectangle>> sectionBBox = document.streamAllSubNodesOfType(NodeType.SECTION).map(SemanticNode::getBBox).collect(new SectionGridCollector());
|
||||
Map<Integer, List<SectionRectangle>> paragraphBBox = document.streamAllSubNodesOfType(NodeType.PARAGRAPH).map(SemanticNode::getBBox).collect(new SectionGridCollector());
|
||||
Map<Integer, List<SectionRectangle>> headlineBBox = document.streamAllSubNodesOfType(NodeType.HEADLINE).map(SemanticNode::getBBox).collect(new SectionGridCollector());
|
||||
Map<Integer, List<SectionRectangle>> tableBBox = document.streamAllSubNodesOfType(NodeType.TABLE).map(node -> (Table) node).collect(new TableGridCollector());
|
||||
var sectionGrid = new SectionGrid();
|
||||
|
||||
sectionGrid.setRectanglesPerPage(mergeMapsByConcatenatingLists(//
|
||||
mergeMapsByConcatenatingLists(paragraphBBox, headlineBBox), //
|
||||
mergeMapsByConcatenatingLists(sectionBBox, tableBBox)));
|
||||
|
||||
return sectionGrid;
|
||||
}
|
||||
|
||||
|
||||
private static abstract class GridCollector<T> implements Collector<T, Map<Integer, List<SectionRectangle>>, Map<Integer, List<SectionRectangle>>> {
|
||||
|
||||
@Override
|
||||
public Supplier<Map<Integer, List<SectionRectangle>>> supplier() {
|
||||
|
||||
return HashMap::new;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Function<Map<Integer, List<SectionRectangle>>, Map<Integer, List<SectionRectangle>>> finisher() {
|
||||
|
||||
return Function.identity();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BinaryOperator<Map<Integer, List<SectionRectangle>>> combiner() {
|
||||
|
||||
return SectionGridCreatorService::mergeMapsByConcatenatingLists;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<Characteristics> characteristics() {
|
||||
|
||||
return Set.of(Characteristics.IDENTITY_FINISH, Characteristics.CONCURRENT, Characteristics.UNORDERED);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static class TableGridCollector extends GridCollector<Table> {
|
||||
|
||||
@Override
|
||||
public BiConsumer<Map<Integer, List<SectionRectangle>>, Table> accumulator() {
|
||||
|
||||
return (map, table) -> table.getPages()
|
||||
.forEach(page -> map.merge(page.getNumber(), List.of(toSectionRectangle(table, page, table.getPages().size())), SectionGridCreatorService::concatLists));
|
||||
}
|
||||
|
||||
|
||||
private static SectionRectangle toSectionRectangle(Table table, Page page, int numberOfParts) {
|
||||
|
||||
Rectangle2D rect = table.getBBox().get(page);
|
||||
List<CellRectangle> tableCellRectangles = table.streamTableCells()
|
||||
.map(TableCell::getBBox)
|
||||
.map(map -> map.get(page))
|
||||
.filter(Objects::nonNull)
|
||||
.map(rectangle2D -> new CellRectangle(new Point((float) rectangle2D.getX(), (float) rectangle2D.getY()),
|
||||
(float) rectangle2D.getWidth(),
|
||||
(float) rectangle2D.getHeight()))
|
||||
.toList();
|
||||
return new SectionRectangle(new Point((float) rect.getX(), (float) rect.getY()),
|
||||
(float) rect.getWidth(),
|
||||
(float) rect.getHeight(),
|
||||
1,
|
||||
numberOfParts,
|
||||
tableCellRectangles);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static class SectionGridCollector extends GridCollector<Map<Page, Rectangle2D>> {
|
||||
|
||||
@Override
|
||||
public BiConsumer<Map<Integer, List<SectionRectangle>>, Map<Page, Rectangle2D>> accumulator() {
|
||||
|
||||
return (mapToKeep, mapToMerge) -> mapToMerge.forEach((page, rectangle) -> mapToKeep.merge(page.getNumber(),
|
||||
List.of(toSectionRectangle(rectangle, mapToMerge.values().size())),
|
||||
SectionGridCreatorService::concatLists));
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static SectionRectangle toSectionRectangle(Rectangle2D rect, int numberOfParts) {
|
||||
|
||||
return new SectionRectangle(new Point((float) rect.getX(), (float) rect.getY()), (float) rect.getWidth(), (float) rect.getHeight(), 1, numberOfParts, null);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static Map<Integer, List<SectionRectangle>> mergeMapsByConcatenatingLists(Map<Integer, List<SectionRectangle>> mapToKeep,
|
||||
Map<Integer, List<SectionRectangle>> mapToMerge) {
|
||||
|
||||
mapToMerge.forEach((page, rectangle) -> mapToKeep.merge(page, rectangle, SectionGridCreatorService::concatLists));
|
||||
return mapToKeep;
|
||||
}
|
||||
|
||||
|
||||
private static List<SectionRectangle> concatLists(List<SectionRectangle> l1, List<SectionRectangle> l2) {
|
||||
|
||||
return Stream.concat(l1.stream(), l2.stream()).toList();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
@ -9,18 +9,18 @@ import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
@ -12,15 +12,15 @@ import java.util.Set;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
|
||||
@Service
|
||||
public class TableExtractionService {
|
||||
@ -0,0 +1,75 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.TextPositionsWithPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionSequenceComparator;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class TextPositionSequenceExtractionService {
|
||||
|
||||
public List<TextPositionsWithPage> getSortedTextPositionsWithPages(String filename) throws IOException {
|
||||
|
||||
List<TextPositionsWithPage> textPositionSequencesPerPage = new LinkedList<>();
|
||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
|
||||
|
||||
PDDocument pdDocument = Loader.loadPDF(inputStream);
|
||||
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(pdDocument);
|
||||
|
||||
// var sortedTextPositionSequences = stripper.getTextPositionSequences();
|
||||
Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences()
|
||||
.stream()
|
||||
.sorted(new TextPositionSequenceComparator())
|
||||
.collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees()));
|
||||
|
||||
var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation());
|
||||
|
||||
textPositionSequencesPerPage.add(new TextPositionsWithPage(sortedTextPositionSequences, RectangleTransformations.toRectangle2D(pdPage.getCropBox())));
|
||||
}
|
||||
|
||||
pdDocument.close();
|
||||
}
|
||||
|
||||
return textPositionSequencesPerPage;
|
||||
}
|
||||
|
||||
|
||||
public List<TextPositionSequence> sortByDirAccordingToPageRotation(Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir, int rotation) {
|
||||
|
||||
LinkedList<Float> sortedKeys = new LinkedList<>(sortedTextPositionSequencesPerDir.keySet().stream().sorted().toList());
|
||||
|
||||
for (int i = 0; i < sortedKeys.size(); i++) {
|
||||
if (sortedKeys.get(i) < rotation) {
|
||||
Float keyToSwap = sortedKeys.remove(i);
|
||||
sortedKeys.addLast(keyToSwap);
|
||||
}
|
||||
}
|
||||
return sortedKeys.stream().map(sortedTextPositionSequencesPerDir::get).flatMap(Collection::stream).toList();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,229 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
@Service
|
||||
public class DocuMineBlockificationService {
|
||||
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
/**
|
||||
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The words of a page.
|
||||
* @param horizontalRulingLines Horizontal table lines.
|
||||
* @param verticalRulingLines Vertical table lines.
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
|
||||
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Float splitX1 = null;
|
||||
for (TextPositionSequence word : textPositions) {
|
||||
|
||||
boolean lineSeparation = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * 1.25;
|
||||
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle()
|
||||
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap)) {
|
||||
|
||||
Orientation prevOrientation = null;
|
||||
if (!chunkBlockList1.isEmpty()) {
|
||||
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords);
|
||||
chunkBlockList1.add(cb1);
|
||||
chunkWords = new ArrayList<>();
|
||||
|
||||
if (splitByX && !isSplitByRuling) {
|
||||
wasSplitted = true;
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
splitX1 = word.getMinXDirAdj();
|
||||
} else if (newLineAfterSplit && !isSplitByRuling) {
|
||||
wasSplitted = false;
|
||||
cb1.setOrientation(Orientation.RIGHT);
|
||||
splitX1 = null;
|
||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
}
|
||||
|
||||
minX = 1000;
|
||||
maxX = 0;
|
||||
minY = 1000;
|
||||
maxY = 0;
|
||||
prev = null;
|
||||
}
|
||||
|
||||
chunkWords.add(word);
|
||||
|
||||
prev = word;
|
||||
if (word.getMinXDirAdj() < minX) {
|
||||
minX = word.getMinXDirAdj();
|
||||
}
|
||||
if (word.getMaxXDirAdj() > maxX) {
|
||||
maxX = word.getMaxXDirAdj();
|
||||
}
|
||||
if (word.getMinYDirAdj() < minY) {
|
||||
minY = word.getMinYDirAdj();
|
||||
}
|
||||
if (word.getMaxYDirAdj() > maxY) {
|
||||
maxY = word.getMaxYDirAdj();
|
||||
}
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords);
|
||||
if (cb1 != null) {
|
||||
chunkBlockList1.add(cb1);
|
||||
}
|
||||
|
||||
return new ClassificationPage(chunkBlockList1);
|
||||
}
|
||||
|
||||
|
||||
private boolean equalsWithThreshold(float f1, float f2) {
|
||||
|
||||
return Math.abs(f1 - f2) < THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
|
||||
if (textBlock != null) {
|
||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float minX,
|
||||
float minY,
|
||||
float maxX,
|
||||
float maxY,
|
||||
TextPositionSequence word,
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
return isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()); //
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
for (Ruling ruling : rulingLines) {
|
||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private double round(float value, int decimalPoints) {
|
||||
|
||||
var d = Math.pow(10, decimalPoints);
|
||||
return Math.round(value * d) / d;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,278 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
@SuppressWarnings("all")
|
||||
@Service
|
||||
public class RedactManagerBlockificationService {
|
||||
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
/**
|
||||
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The words of a page.
|
||||
* @param horizontalRulingLines Horizontal table lines.
|
||||
* @param verticalRulingLines Vertical table lines.
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
int indexOnPage = 0;
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
||||
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Float splitX1 = null;
|
||||
for (TextPositionSequence word : textPositions) {
|
||||
|
||||
boolean lineSeparation = word.getMinYDirAdj() - maxY > word.getHeight() * 1.25;
|
||||
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||
|
||||
Orientation prevOrientation = null;
|
||||
if (!chunkBlockList.isEmpty()) {
|
||||
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
indexOnPage++;
|
||||
|
||||
chunkBlockList.add(cb1);
|
||||
chunkWords = new ArrayList<>();
|
||||
|
||||
if (splitByX && !isSplitByRuling) {
|
||||
wasSplitted = true;
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
splitX1 = word.getMinXDirAdj();
|
||||
} else if (newLineAfterSplit && !isSplitByRuling) {
|
||||
wasSplitted = false;
|
||||
cb1.setOrientation(Orientation.RIGHT);
|
||||
splitX1 = null;
|
||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
}
|
||||
|
||||
minX = 1000;
|
||||
maxX = 0;
|
||||
minY = 1000;
|
||||
maxY = 0;
|
||||
prev = null;
|
||||
}
|
||||
|
||||
chunkWords.add(word);
|
||||
|
||||
prev = word;
|
||||
if (word.getMinXDirAdj() < minX) {
|
||||
minX = word.getMinXDirAdj();
|
||||
}
|
||||
if (word.getMaxXDirAdj() > maxX) {
|
||||
maxX = word.getMaxXDirAdj();
|
||||
}
|
||||
if (word.getMinYDirAdj() < minY) {
|
||||
minY = word.getMinYDirAdj();
|
||||
}
|
||||
if (word.getMaxYDirAdj() > maxY) {
|
||||
maxY = word.getMaxYDirAdj();
|
||||
}
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
||||
if (cb1 != null) {
|
||||
chunkBlockList.add(cb1);
|
||||
}
|
||||
|
||||
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
|
||||
|
||||
TextPageBlock previousLeft = null;
|
||||
TextPageBlock previousRight = null;
|
||||
while (itty.hasNext()) {
|
||||
TextPageBlock block = (TextPageBlock) itty.next();
|
||||
|
||||
if (previousLeft != null && block.getOrientation().equals(Orientation.LEFT)) {
|
||||
if (previousLeft.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousLeft.getMinY()) {
|
||||
previousLeft.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (previousRight != null && block.getOrientation().equals(Orientation.RIGHT)) {
|
||||
if (previousRight.getMinY() > block.getMinY() && block.getMaxY() + block.getMostPopularWordHeight() > previousRight.getMinY()) {
|
||||
previousRight.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (block.getOrientation().equals(Orientation.LEFT)) {
|
||||
previousLeft = block;
|
||||
} else if (block.getOrientation().equals(Orientation.RIGHT)) {
|
||||
previousRight = block;
|
||||
}
|
||||
}
|
||||
|
||||
itty = chunkBlockList.iterator();
|
||||
TextPageBlock previous = null;
|
||||
while (itty.hasNext()) {
|
||||
TextPageBlock block = (TextPageBlock) itty.next();
|
||||
|
||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
||||
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||
previous.add(block);
|
||||
itty.remove();
|
||||
continue;
|
||||
}
|
||||
|
||||
previous = block;
|
||||
}
|
||||
|
||||
return new ClassificationPage(chunkBlockList);
|
||||
}
|
||||
|
||||
|
||||
private boolean equalsWithThreshold(float f1, float f2) {
|
||||
|
||||
return Math.abs(f1 - f2) < THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
|
||||
if (textBlock != null) {
|
||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float minX,
|
||||
float minY,
|
||||
float maxX,
|
||||
float maxY,
|
||||
TextPositionSequence word,
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
return isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight());
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
for (Ruling ruling : rulingLines) {
|
||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private double round(float value, int decimalPoints) {
|
||||
|
||||
var d = Math.pow(10, decimalPoints);
|
||||
return Math.round(value * d) / d;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
@ -9,17 +9,17 @@ import java.util.regex.Pattern;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.RulingTextDirAdjustUtil;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
@Service
|
||||
@SuppressWarnings("all")
|
||||
public class BlockificationService {
|
||||
public class TaasBlockificationService {
|
||||
|
||||
private static final float THRESHOLD = 1f;
|
||||
private static final float Y_GAP_SPLIT_HEIGHT_MODIFIER = 1.25f;
|
||||
@ -137,7 +137,7 @@ public class BlockificationService {
|
||||
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
|
||||
// TODO: make static final constant
|
||||
var listIdentitifier = Pattern.compile("\\b(?:[1-9]|1\\d|20|[ivxlc]|[a-z])\\s*(?:[.)])", Pattern.CASE_INSENSITIVE);
|
||||
|
||||
boolean wasSplitted = false;
|
||||
@ -146,7 +146,7 @@ public class BlockificationService {
|
||||
|
||||
Matcher listIdentifierPattern = listIdentitifier.matcher(word.toString());
|
||||
|
||||
boolean yGap = word.getMinYDirAdj() - maxY > word.getHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||
boolean yGap = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||
boolean sameLine = prev != null && equalsWithThreshold(prev.getMinYDirAdj(), word.getMinYDirAdj());
|
||||
boolean positiveXGapInline = prev != null && maxX + X_GAP_SPLIT_CONSTANT < word.getMinXDirAdj() && sameLine;
|
||||
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
|
||||
@ -0,0 +1,117 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class DocuMineClassificationService {
|
||||
|
||||
private final BodyTextFrameService bodyTextFrameService;
|
||||
private static final Pattern pattern = Pattern.compile("^(\\d{1,1}\\.?){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z\\[\\]\\-]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern2 = Pattern.compile(".*\\d{4}$", Pattern.CASE_INSENSITIVE);
|
||||
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
|
||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
classifyPage(page, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
|
||||
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
|
||||
log.debug("headlineFontSizes: {}", headlineFontSizes);
|
||||
var bodyTextFrame = page.getBodyTextFrame();
|
||||
|
||||
Matcher matcher = pattern.matcher(textBlock.toString());
|
||||
Matcher matcher2 = pattern2.matcher(textBlock.toString());
|
||||
Matcher matcher3 = pattern3.matcher(textBlock.toString());
|
||||
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
return;
|
||||
}
|
||||
if (textBlock.getText().length() > 6 && (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter()
|
||||
.getMostPopular() || textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()) && PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||
|
||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
||||
.contains(":") || textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString()
|
||||
.contains(":") || textBlock.toString().startsWith("APPENDIX") || textBlock.toString().startsWith("FIGURE") || textBlock.toString()
|
||||
.startsWith("TABLE")) && !textBlock.toString().endsWith(":")) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
||||
document.setHeadlines(true);
|
||||
|
||||
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && !matcher3.matches() && !matcher2.matches()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||
.size() == 1)) {
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification(PageBlockType.TITLE);
|
||||
}
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
|
||||
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
|
||||
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
|
||||
.getMostPopular()
|
||||
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
||||
} else {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,116 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class RedactManagerClassificationService {
|
||||
|
||||
private final BodyTextFrameService bodyTextFrameService;
|
||||
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
Rectangle bodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), false);
|
||||
Rectangle landscapeBodyTextFrame = bodyTextFrameService.calculateBodyTextFrame(document.getPages(), document.getFontSizeCounter(), true);
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
|
||||
for (ClassificationPage page : document.getPages()) {
|
||||
bodyTextFrameService.setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||
classifyPage(page, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void classifyPage(ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
|
||||
for (AbstractPageBlock textBlock : page.getTextBlocks()) {
|
||||
if (textBlock instanceof TextPageBlock) {
|
||||
classifyBlock((TextPageBlock) textBlock, page, document, headlineFontSizes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void classifyBlock(TextPageBlock textBlock, ClassificationPage page, ClassificationDocument document, List<Float> headlineFontSizes) {
|
||||
|
||||
var bodyTextFrame = page.getBodyTextFrame();
|
||||
|
||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
return;
|
||||
}
|
||||
if (PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
} else if (PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||
.size() == 1)) {
|
||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||
textBlock.setClassification(PageBlockType.TITLE);
|
||||
}
|
||||
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
|
||||
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
|
||||
.getCountPerValue()
|
||||
.containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
|
||||
.get(0)
|
||||
.getTextPositions()
|
||||
.get(0)
|
||||
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||
|
||||
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
||||
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(i));
|
||||
document.setHeadlines(true);
|
||||
}
|
||||
}
|
||||
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
|
||||
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
|
||||
.get(0)
|
||||
.getTextPositions()
|
||||
.get(0)
|
||||
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
|
||||
document.setHeadlines(true);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
|
||||
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
|
||||
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
|
||||
.getMostPopular()
|
||||
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
||||
} else {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
@ -6,12 +6,13 @@ import java.util.regex.Pattern;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -19,7 +20,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class ClassificationService {
|
||||
public class TaasClassificationService {
|
||||
|
||||
private final BodyTextFrameService bodyTextFrameService;
|
||||
|
||||
@ -9,7 +9,7 @@
|
||||
* This program is free software under the LGPL (>=v2.1)
|
||||
* Read the file LICENSE.txt coming with the sources for details.
|
||||
*/
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.math.BigDecimal;
|
||||
import java.util.Comparator;
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.IOException;
|
||||
@ -150,6 +151,32 @@ public class PdfVisualisationUtility {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawLine2DList(PDDocument pdDocument, int pageNumber, List<Line2D> line2DS, Options options) {
|
||||
|
||||
var pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true);
|
||||
|
||||
contentStream.setStrokingColor(options.getStrokeColor());
|
||||
contentStream.setNonStrokingColor(options.getFillColor());
|
||||
contentStream.setLineWidth(options.getStrokeWidth());
|
||||
|
||||
for (var line2D : line2DS) {
|
||||
contentStream.moveTo((float) line2D.getX1(), (float) line2D.getY1());
|
||||
contentStream.lineTo((float) line2D.getX2(), (float) line2D.getY2());
|
||||
|
||||
if (options.isStroke() && options.isFill()) {
|
||||
contentStream.fillAndStroke();
|
||||
} else if (options.isStroke()) {
|
||||
contentStream.stroke();
|
||||
} else if (options.isFill()) {
|
||||
contentStream.fill();
|
||||
}
|
||||
}
|
||||
contentStream.close();
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
@Getter
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.Comparator;
|
||||
@ -1,5 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.util.Collections;
|
||||
@ -23,6 +25,27 @@ import lombok.NoArgsConstructor;
|
||||
|
||||
public class RectangleTransformations {
|
||||
|
||||
public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY);
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D pad(Rectangle2D rectangle2D, double deltaX, double deltaY) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY);
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
|
||||
|
||||
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
|
||||
}
|
||||
public static Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> collectBBox() {
|
||||
|
||||
return new Rectangle2DBBoxCollector();
|
||||
}
|
||||
|
||||
public static PDRectangle toPDRectangleBBox(List<Rectangle> rectangles) {
|
||||
|
||||
Rectangle2D rectangle2D = RectangleTransformations.rectangleBBox(rectangles);
|
||||
@ -42,6 +65,11 @@ public class RectangleTransformations {
|
||||
}
|
||||
|
||||
|
||||
public static String toString(Rectangle2D rectangle2D) {
|
||||
|
||||
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||
}
|
||||
|
||||
public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
|
||||
|
||||
return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector());
|
||||
@ -56,6 +84,11 @@ public class RectangleTransformations {
|
||||
-redactionLogRectangle.getHeight());
|
||||
}
|
||||
|
||||
public static Rectangle2D toRectangle2D(PDRectangle rectangle) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle toRedactionLogRectangle(Rectangle2D rectangle2D, int pageNumber) {
|
||||
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@ -5,15 +5,18 @@ import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class TableMergingUtility {
|
||||
|
||||
private static final double TABLE_ALIGNMENT_THRESHOLD = 2d;
|
||||
|
||||
|
||||
public List<TablePageBlock> findConsecutiveTablesWithSameColCountAndSameHeaders(TablePageBlock originalTablePageBlock, List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
List<TablePageBlock> consecutiveTables = pageBlocks.stream()
|
||||
@ -24,7 +27,8 @@ public class TableMergingUtility {
|
||||
|
||||
List<TablePageBlock> consecutiveTablesWithSameColCountAndHeaders = new LinkedList<>();
|
||||
for (TablePageBlock consecutiveTable : consecutiveTables) {
|
||||
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable)) {
|
||||
if (consecutiveTable.getColCount() == originalTablePageBlock.getColCount() && !hasTableHeader(consecutiveTable) && outerBoundaryAlignsX(originalTablePageBlock,
|
||||
consecutiveTable)) {
|
||||
consecutiveTablesWithSameColCountAndHeaders.add(consecutiveTable);
|
||||
} else {
|
||||
break;
|
||||
@ -34,6 +38,12 @@ public class TableMergingUtility {
|
||||
}
|
||||
|
||||
|
||||
private static boolean outerBoundaryAlignsX(TablePageBlock originalTablePageBlock, TablePageBlock consecutiveTable) {
|
||||
|
||||
return Math.abs(consecutiveTable.getMinX() - originalTablePageBlock.getMinX()) < TABLE_ALIGNMENT_THRESHOLD && Math.abs(consecutiveTable.getMaxX() - originalTablePageBlock.getMaxX()) < TABLE_ALIGNMENT_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private boolean hasTableHeader(TablePageBlock table) {
|
||||
|
||||
return table.getRows().stream().flatMap(Collection::stream).anyMatch(Cell::isHeaderCell);
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@ -3,8 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
public class TextPositionOperations {
|
||||
|
||||
|
||||
@ -0,0 +1,72 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
/**
|
||||
* This class is a comparator for TextPosition operators. It handles
|
||||
* pages with text in different directions by grouping the text based
|
||||
* on direction and sorting in that direction. This allows continuous text
|
||||
* in a given direction to be more easily grouped together.
|
||||
*
|
||||
* @author Ben Litchfield
|
||||
*/
|
||||
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence>
|
||||
{
|
||||
@Override
|
||||
public int compare(TextPositionSequence pos1, TextPositionSequence pos2)
|
||||
{
|
||||
// only compare text that is in the same direction
|
||||
int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
|
||||
if (cmp1 != 0)
|
||||
{
|
||||
return cmp1;
|
||||
}
|
||||
|
||||
// get the text direction adjusted coordinates
|
||||
float x1 = pos1.getMinXDirAdj();
|
||||
float x2 = pos2.getMinXDirAdj();
|
||||
|
||||
float pos1YBottom = pos1.getMaxYDirAdj();
|
||||
float pos2YBottom = pos2.getMaxYDirAdj();
|
||||
|
||||
// note that the coordinates have been adjusted so 0,0 is in upper left
|
||||
float pos1YTop = pos1YBottom - pos1.getTextHeight();
|
||||
float pos2YTop = pos2YBottom - pos2.getTextHeight();
|
||||
|
||||
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
||||
|
||||
// we will do a simple tolerance comparison
|
||||
if (yDifference < .1 ||
|
||||
pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom ||
|
||||
pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)
|
||||
{
|
||||
return Float.compare(x1, x2);
|
||||
}
|
||||
else if (pos1YBottom < pos2YBottom)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,16 @@
|
||||
<Configuration>
|
||||
|
||||
<Appenders>
|
||||
<Console name="CONSOLE" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
|
||||
</Console>
|
||||
</Appenders>
|
||||
|
||||
<Loggers>
|
||||
<Root level="warn">
|
||||
<AppenderRef ref="CONSOLE"/>
|
||||
</Root>
|
||||
<Logger name="com.iqser" level="info"/>
|
||||
</Loggers>
|
||||
|
||||
</Configuration>
|
||||
@ -20,7 +20,8 @@ import org.springframework.beans.factory.annotation.Autowired;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
@ -37,7 +38,7 @@ public class BdrJsonBuildTest extends BaseTest {
|
||||
private ObjectMapper objectMapper;
|
||||
|
||||
@Autowired
|
||||
private LayoutParsingService layoutParsingService;
|
||||
private LayoutParsingPipeline layoutParsingPipeline;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@ -45,7 +46,7 @@ public class BdrJsonBuildTest extends BaseTest {
|
||||
|
||||
try (InputStream inputStream = new FileInputStream(filename)) {
|
||||
PDDocument pdDocument = Loader.loadPDF(inputStream);
|
||||
return layoutParsingService.parseLayoutWithTimer(pdDocument, new ImageServiceResponse(), new TableServiceResponse());
|
||||
return layoutParsingPipeline.parseLayoutWithTimer(LayoutParsingType.REDACT_MANAGER, pdDocument, new ImageServiceResponse(), new TableServiceResponse());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -11,7 +11,8 @@ import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BaseTest;
|
||||
@ -21,7 +22,7 @@ import lombok.SneakyThrows;
|
||||
public class BuildDocumentGraphTest extends BaseTest {
|
||||
|
||||
@Autowired
|
||||
private LayoutParsingService layoutParsingService;
|
||||
private LayoutParsingPipeline layoutParsingPipeline;
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@ -50,7 +51,7 @@ public class BuildDocumentGraphTest extends BaseTest {
|
||||
|
||||
try (InputStream inputStream = fileResource.getInputStream()) {
|
||||
PDDocument pdDocument = Loader.loadPDF(inputStream);
|
||||
return layoutParsingService.parseLayout(pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse());
|
||||
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -25,7 +25,8 @@ public class DocumentGraphMappingTest extends BuildDocumentGraphTest {
|
||||
DocumentData documentData = DocumentDataMapper.toDocumentData(document);
|
||||
var researchDocumentData = TaasDocumentDataMapper.fromDocument(document);
|
||||
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, researchDocumentData, documentData);
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, documentData);
|
||||
|
||||
DocumentData documentData2 = layoutParsingStorageService.readDocumentData(layoutParsingRequest);
|
||||
Document newDocumentGraph = DocumentGraphMapper.toDocumentGraph(documentData2);
|
||||
|
||||
|
||||
@ -0,0 +1,58 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.model;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
|
||||
|
||||
class SectionIdentifierTest {
|
||||
|
||||
@Test
|
||||
public void testParentOf() {
|
||||
|
||||
var headline = SectionIdentifier.fromSearchText("1 Did you ever hear the tragedy of Darth Plagueis The Wise?");
|
||||
var headline1 = SectionIdentifier.fromSearchText("1.0 I thought not. It’s not a story the Jedi would tell you.");
|
||||
var headline2 = SectionIdentifier.fromSearchText("1.1 It’s a Sith legend. Darth Plagueis was a Dark Lord of the Sith, ");
|
||||
var headline3 = SectionIdentifier.fromSearchText("1.2.3 so powerful and so wise he could use the Force to influence the midichlorians to create life…");
|
||||
var headline4 = SectionIdentifier.fromSearchText("1.2.3.4 He had such a knowledge of the dark side that he could even keep the ones he cared about from dying.");
|
||||
var headline5 = SectionIdentifier.fromSearchText("1.2.3.4.5 The dark side of the Force is a pathway to many abilities some consider to be unnatural.");
|
||||
var headline6 = SectionIdentifier.fromSearchText("2.0 He became so powerful…");
|
||||
var headline7 = SectionIdentifier.fromSearchText("10000.0 the only thing he was afraid of was losing his power,");
|
||||
var headline8 = SectionIdentifier.fromSearchText("A.0 which eventually, of course, he did.");
|
||||
var headline9 = SectionIdentifier.fromSearchText("Unfortunately, he taught his apprentice everything he knew, then his apprentice killed him in his sleep.");
|
||||
var headline10 = SectionIdentifier.fromSearchText("2.1.2 Ironic.");
|
||||
var headline11 = SectionIdentifier.fromSearchText("2.He could save others from death,");
|
||||
var headline12 = SectionIdentifier.fromSearchText(" 2. but not himself.");
|
||||
|
||||
var paragraph1 = SectionIdentifier.asChildOf(headline);
|
||||
assertTrue(paragraph1.isChildOf(headline));
|
||||
assertTrue(headline.isParentOf(paragraph1));
|
||||
assertFalse(paragraph1.isParentOf(headline));
|
||||
|
||||
assertFalse(headline.isParentOf(headline1));
|
||||
assertTrue(headline.isParentOf(headline2));
|
||||
assertTrue(headline.isParentOf(headline3));
|
||||
assertTrue(headline.isParentOf(headline4));
|
||||
assertTrue(headline.isParentOf(headline5));
|
||||
assertTrue(headline1.isParentOf(headline2));
|
||||
assertFalse(headline1.isParentOf(headline1));
|
||||
assertTrue(headline3.isParentOf(headline4));
|
||||
assertFalse(headline4.isParentOf(headline5));
|
||||
assertFalse(headline2.isParentOf(headline3));
|
||||
assertFalse(headline2.isParentOf(headline4));
|
||||
assertTrue(headline1.isParentOf(headline3));
|
||||
assertTrue(headline1.isParentOf(headline4));
|
||||
assertFalse(headline1.isParentOf(headline6));
|
||||
assertFalse(headline1.isParentOf(headline7));
|
||||
assertFalse(headline8.isParentOf(headline1));
|
||||
assertFalse(headline8.isParentOf(headline2));
|
||||
assertFalse(headline8.isParentOf(headline3));
|
||||
assertFalse(headline8.isParentOf(headline4));
|
||||
assertFalse(headline9.isParentOf(headline9));
|
||||
assertTrue(headline10.isChildOf(headline11));
|
||||
assertTrue(headline10.isChildOf(headline12));
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,71 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.DividingColumnDetectionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class ColumnDetectionServiceTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testGapBasedColumnDetection() {
|
||||
|
||||
String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
List<PageInformation> pageInformations = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename).stream().map(PageInformationService::build).toList();
|
||||
List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
|
||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start column detection");
|
||||
start = System.currentTimeMillis();
|
||||
for (PageInformation pageInformation : pageInformations) {
|
||||
GapInformation gapInformation = GapDetectionService.findGapsInLines(pageInformation.getPageContents().getSortedTextPositionSequences(), pageInformation.getMainBodyTextFrame());
|
||||
columnsPerPage.add(GapsAcrossLinesService.detectXGapsAcrossLines(gapInformation, pageInformation.getMainBodyTextFrame()));
|
||||
}
|
||||
System.out.printf("Finished column detection in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start draw rectangles");
|
||||
start = System.currentTimeMillis();
|
||||
PdfDraw.drawRectanglesPerPage(filename, columnsPerPage, tmpFileName);
|
||||
System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testColumnDetection() {
|
||||
|
||||
String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
List<PageContents> sortedTextPositionSequencesPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename);
|
||||
List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
|
||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start column detection");
|
||||
start = System.currentTimeMillis();
|
||||
for (PageContents pageContents : sortedTextPositionSequencesPerPage) {
|
||||
columnsPerPage.add(DividingColumnDetectionService.detectColumns(pageContents));
|
||||
}
|
||||
System.out.printf("Finished column detection in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start draw rectangles");
|
||||
start = System.currentTimeMillis();
|
||||
PdfDraw.drawRectanglesPerPage(filename, columnsPerPage, tmpFileName);
|
||||
System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,23 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class InvisibleTableDetectionServiceTest {
|
||||
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void detectInvisibleTableTest() {
|
||||
|
||||
String fileName = "files/test-two-pages_ocred-2.pdf";
|
||||
|
||||
List<PageContents> pageContents = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
|
||||
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
/**
 * Placeholder test class: it declares no test methods yet.
 */
class MainBodyTextFrameExtractionServiceTest {

}
|
||||
@ -0,0 +1,50 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.MainBodyTextFrameExtractionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class GapDetectionServiceTest {
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
public void testGapDetection() {
|
||||
|
||||
String filename = "files/Metolachlor/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
List<PageContents> sortedTextPositionSequencesPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(filename);
|
||||
List<GapInformation> gapInformationInLinesPerPage = new LinkedList<>();
|
||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start gap detection");
|
||||
start = System.currentTimeMillis();
|
||||
for (PageContents pageContents : sortedTextPositionSequencesPerPage) {
|
||||
// List<List<TextPositionSequence>> lines = LineDetectionService.orderByLine(textPositionsWithPage.getSortedTextPositionSequences());
|
||||
Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(pageContents);
|
||||
gapInformationInLinesPerPage.add(GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), mainBodyTextFrame));
|
||||
}
|
||||
System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start draw rectangles");
|
||||
start = System.currentTimeMillis();
|
||||
PdfDraw.drawRectanglesAndLinesPerPage(filename,
|
||||
gapInformationInLinesPerPage.stream().map(gaps -> gaps.getYGaps().stream().flatMap(Collection::stream).toList()).toList(),
|
||||
gapInformationInLinesPerPage.stream().map(gaps -> gaps.getXGaps().stream().flatMap(Collection::stream).toList()).toList(), tmpFileName);
|
||||
System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,39 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextPositionSequenceSorter;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class TextPositionSequenceSorterTest {
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testTextPositionSequenceExtraction() {
|
||||
|
||||
String fileName = "files/new/test-two-pages_ocred-2.pdf";
|
||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();
|
||||
|
||||
List<PageContents> textPositionPerPage = TextPositionSequenceSorter.getSortedTextPositionsWithPages(fileName);
|
||||
|
||||
PdfDraw.drawRectanglesPerPageNumberedByLine(fileName,
|
||||
textPositionPerPage.stream()
|
||||
.map(t -> t.getSortedTextPositionSequences()
|
||||
.stream()
|
||||
.map(TextPositionSequence::getRectangle)
|
||||
.map(RectangleTransformations::toRectangle2D)
|
||||
//.map(textPositionSequence -> (Rectangle2D) new Rectangle2D.Double(textPositionSequence.getMaxXDirAdj(), textPositionSequence.getMaxYDirAdj(), textPositionSequence.getWidth(), textPositionSequence.getHeight()))
|
||||
.map(List::of)
|
||||
.toList())
|
||||
.toList(), tmpFileName);
|
||||
}
|
||||
|
||||
}
|
||||
@ -3,16 +3,20 @@ package com.knecon.fforesight.service.layoutparser.server.utils.visualizations;
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.font.PDType1Font;
|
||||
import org.apache.pdfbox.pdmodel.font.Standard14Fonts;
|
||||
import org.apache.pdfbox.util.Matrix;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
@ -20,7 +24,8 @@ import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -34,6 +39,68 @@ import lombok.experimental.UtilityClass;
|
||||
@UtilityClass
|
||||
public class PdfDraw {
|
||||
|
||||
public static void drawRectanglesPerPage(String filename, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) throws IOException {
|
||||
|
||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
|
||||
PDDocument pdDocument = Loader.loadPDF(inputStream);
|
||||
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||
pageNumber,
|
||||
rectanglesPerPage.get(pageNumber - 1),
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
}
|
||||
try (var out = new FileOutputStream(tmpFileName)) {
|
||||
pdDocument.save(out);
|
||||
pdDocument.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
public static void drawRectanglesPerPageNumberedByLine(String filename, List<List<List<Rectangle2D>>> rectanglesPerPage, String tmpFileName) throws IOException {
|
||||
|
||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
|
||||
PDDocument pdDocument = Loader.loadPDF(inputStream);
|
||||
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
var rectanglesOnPage = rectanglesPerPage.get(pageNumber - 1);
|
||||
for (int lineNumber = 0; lineNumber < rectanglesOnPage.size(); lineNumber++) {
|
||||
var rectanglesInLine = rectanglesOnPage.get(lineNumber);
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument, pageNumber, rectanglesInLine, PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
double y = Math.min(rectanglesInLine.get(0).getMinY(), rectanglesInLine.get(0).getMaxY());
|
||||
PdfVisualisationUtility.drawText(String.format("%d", lineNumber),
|
||||
pdDocument,
|
||||
new Point2D.Double(rectanglesInLine.get(0).getX() - (5 + (5 * countNumberOfDigits(lineNumber))), y + 2),
|
||||
pageNumber,
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
try (var out = new FileOutputStream(tmpFileName)) {
|
||||
pdDocument.save(out);
|
||||
pdDocument.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static int countNumberOfDigits(int num) {
|
||||
|
||||
if (num == 0) {
|
||||
return 1;
|
||||
}
|
||||
int count = 0;
|
||||
for (; num != 0; num /= 10, ++count) {
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
public static void drawDocumentGraph(PDDocument document, Document documentGraph) {
|
||||
|
||||
documentGraph.getDocumentTree().allEntriesInOrder().forEach(entry -> drawNode(document, entry));
|
||||
@ -115,6 +182,35 @@ public class PdfDraw {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static void drawRectanglesAndLinesPerPage(String filename, List<List<Rectangle2D>> list, List<List<Rectangle2D>> rectanglesPerPage, String tmpFileName) {
|
||||
|
||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
|
||||
PDDocument pdDocument = Loader.loadPDF(inputStream);
|
||||
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
// PdfVisualisationUtility.drawLine2DList(pdDocument,
|
||||
// pageNumber,
|
||||
// list.get(pageNumber - 1),
|
||||
// PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||
pageNumber,
|
||||
rectanglesPerPage.get(pageNumber - 1),
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
PdfVisualisationUtility.drawRectangle2DList(pdDocument,
|
||||
pageNumber,
|
||||
list.get(pageNumber - 1),
|
||||
PdfVisualisationUtility.Options.builder().stroke(true).build());
|
||||
}
|
||||
try (var out = new FileOutputStream(tmpFileName)) {
|
||||
pdDocument.save(out);
|
||||
pdDocument.close();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
|
||||
@ -0,0 +1,37 @@
|
||||
# Default configuration of the layout parser service.
# NOTE(review): the nesting below is reconstructed from the standard Spring Boot property tree
# because the original indentation is not visible here — confirm against the committed file.
info:
  description: Layout Parser Service Processor

tenant-user-management-service.url: "http://tenant-user-management-service:8080/internal"
fforesight.tenants.remote: true

server:
  port: 8080

spring:
  main:
    allow-circular-references: true # FIXME
  rabbitmq:
    # Connection settings; each can be overridden through the named environment variable.
    host: ${RABBITMQ_HOST:localhost}
    port: ${RABBITMQ_PORT:5672}
    username: ${RABBITMQ_USERNAME:user}
    password: ${RABBITMQ_PASSWORD:rabbitmq}
    listener:
      simple:
        acknowledge-mode: AUTO
        concurrency: 2
        retry:
          enabled: true
          max-attempts: 3
          max-interval: 15000
        prefetch: 1

management:
  endpoint:
    # Metrics and Prometheus endpoints follow the monitoring.enabled switch (off by default).
    metrics.enabled: ${monitoring.enabled:false}
    prometheus.enabled: ${monitoring.enabled:false}
    health.enabled: true
  endpoints.web.exposure.include: prometheus, health

storage:
  backend: 's3'
|
||||
@ -0,0 +1,16 @@
|
||||
<!-- Log4j2 configuration: everything is logged to the console at WARN and above. -->
<Configuration>

    <Appenders>
        <Console name="CONSOLE" target="SYSTEM_OUT">
            <PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
        </Console>
    </Appenders>

    <Loggers>
        <Root level="warn">
            <AppenderRef ref="CONSOLE"/>
        </Root>
        <!-- Loggers below com.iqser are lowered to INFO. -->
        <!-- NOTE(review): the classes of this service live under com.knecon; confirm whether a logger for that package is wanted as well. -->
        <Logger name="com.iqser" level="info"/>
    </Loggers>

</Configuration>
|
||||
@ -7,6 +7,7 @@
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-parent</artifactId>
|
||||
<version>3.0.6</version>
|
||||
<relativePath></relativePath>
|
||||
</parent>
|
||||
|
||||
<groupId>com.knecon.fforesight</groupId>
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user