Merge branch 'TAAS-41' into 'main'
TAAS-41: TAAS Document Structure See merge request fforesight/layout-parser!2
This commit is contained in:
commit
653f280fd1
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data;
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.redaction;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -0,0 +1,21 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
public class ParagraphData {
|
||||
|
||||
private String text;
|
||||
List<Range> boldTextBoundaries;
|
||||
List<Range> italicTextBoundaries;
|
||||
List<Integer> linebreaks;
|
||||
private String classification;
|
||||
|
||||
private String orientation;
|
||||
private int textDirection;
|
||||
|
||||
}
|
||||
@ -0,0 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
/**
 * Immutable pair of integer bounds.
 * <p>
 * NOTE(review): whether {@code end} is inclusive or exclusive is not defined here -
 * confirm with the code that creates instances.
 *
 * @param start first bound of the range
 * @param end   second bound of the range
 */
public record Range(int start, int end) {}
|
||||
@ -0,0 +1,16 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
@Builder
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class ResearchDocumentData {
|
||||
|
||||
String originalFile;
|
||||
List<StructureObject> structureObjects;
|
||||
}
|
||||
@ -0,0 +1,15 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class RowData {
|
||||
|
||||
boolean header;
|
||||
List<ParagraphData> cellText;
|
||||
float[] bBox;
|
||||
}
|
||||
@ -0,0 +1,19 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
public class StructureObject {
|
||||
|
||||
Integer structureObjectNumber;
|
||||
int page;
|
||||
int stringOffset;
|
||||
float[] boundingBox;
|
||||
ParagraphData paragraph;
|
||||
TableData table;
|
||||
|
||||
}
|
||||
@ -0,0 +1,15 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.data.taas;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
public class TableData {
|
||||
|
||||
List<RowData> rowData;
|
||||
Integer numberOfCols;
|
||||
Integer numberOfRows;
|
||||
}
|
||||
@ -2,7 +2,7 @@ package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
||||
|
||||
public class LayoutParsingQueueNames {
|
||||
|
||||
public static final String LAYOUT_PARSING_REQUEST_QUEUE = "LAYOUTPARSING_REQUEST_QUEUE";
|
||||
public static final String LAYOUT_PARSING_DLQ = "LAYOUTPARSING_DLQ";
|
||||
public static final String LAYOUT_PARSING_FINISHED_EVENT_QUEUE = "LAYOUTPARSING_FINISHED_EVENT_QUEUE";
|
||||
public static final String LAYOUT_PARSING_REQUEST_QUEUE = "layout_parsing_request_queue";
|
||||
public static final String LAYOUT_PARSING_DLQ = "layout_parsing_dead_letter_queue";
|
||||
public static final String LAYOUT_PARSING_FINISHED_EVENT_QUEUE = "layout_parsing_response_queue";
|
||||
}
|
||||
|
||||
@ -8,13 +8,16 @@ import lombok.Builder;
|
||||
|
||||
@Builder
|
||||
public record LayoutParsingRequest(
|
||||
LayoutParsingType layoutParsingType,
|
||||
Map<String, String> identifier,
|
||||
String originFileStorageId,
|
||||
Optional<String> tablesFileStorageId,
|
||||
Optional<String> imagesFileStorageId,
|
||||
String structureFileStorageId,
|
||||
String researchDocumentStorageId,
|
||||
String textBlockFileStorageId,
|
||||
String positionBlockFileStorageId,
|
||||
String pageFileStorageId) {
|
||||
String pageFileStorageId,
|
||||
String sectionGridStorageId) {
|
||||
|
||||
}
|
||||
|
||||
@ -0,0 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.internal.api.queue;
|
||||
|
||||
/**
 * Selects the flavour of layout parsing requested for a document.
 * <p>
 * The constant order is part of the contract ({@code ordinal()}/{@code values()}) and
 * must not be changed.
 */
public enum LayoutParsingType {
    REDACT_MANAGER, TAAS, DOCUMINE
}
|
||||
@ -0,0 +1,16 @@
|
||||
<Configuration>
|
||||
|
||||
<Appenders>
|
||||
<Console name="CONSOLE" target="SYSTEM_OUT">
|
||||
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
|
||||
</Console>
|
||||
</Appenders>
|
||||
|
||||
<Loggers>
|
||||
<Root level="warn">
|
||||
<AppenderRef ref="CONSOLE"/>
|
||||
</Root>
|
||||
<Logger name="com.iqser" level="info"/>
|
||||
</Loggers>
|
||||
|
||||
</Configuration>
|
||||
@ -60,6 +60,12 @@
|
||||
<groupId>org.springframework.boot</groupId>
|
||||
<artifactId>spring-boot-starter-amqp</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.junit.jupiter</groupId>
|
||||
<artifactId>junit-jupiter</artifactId>
|
||||
<version>RELEASE</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
|
||||
@ -0,0 +1,141 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.mapper.redaction.DocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.mapper.taas.TaasDocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PdfParsingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionGridCreatorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.TaasClassificationService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class LayoutParsingPipeline {
|
||||
|
||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
private final CvTableParsingAdapter cvTableParsingAdapter;
|
||||
private final LayoutParsingStorageService layoutParsingStorageService;
|
||||
private final PdfParsingService pdfParsingService;
|
||||
private final SectionsBuilderService sectionsBuilderService;
|
||||
private final SectionGridCreatorService sectionGridCreatorService;
|
||||
private final TaasClassificationService taasClassificationService;
|
||||
private final RedactManagerClassificationService redactManagerClassificationService;
|
||||
private final DocuMineClassificationService docuMineClassificationService;
|
||||
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||
|
||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.pageFileStorageId());
|
||||
}
|
||||
|
||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.pageFileStorageId());
|
||||
}
|
||||
|
||||
Document documentGraph = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
|
||||
int numberOfPages = originDocument.getNumberOfPages();
|
||||
originDocument.close();
|
||||
|
||||
layoutParsingStorageService.storeSectionGrid(layoutParsingRequest, sectionGridCreatorService.createSectionGrid(documentGraph));
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
||||
|
||||
if (layoutParsingRequest.layoutParsingType().equals(LayoutParsingType.TAAS)) {
|
||||
var researchDocumentData = TaasDocumentDataMapper.fromDocument(documentGraph);
|
||||
layoutParsingStorageService.storeResearchDocumentData(layoutParsingRequest, researchDocumentData);
|
||||
}
|
||||
|
||||
return LayoutParsingFinishedEvent.builder()
|
||||
.identifier(layoutParsingRequest.identifier())
|
||||
.numberOfPages(numberOfPages)
|
||||
.duration(System.currentTimeMillis() - start)
|
||||
.message(format("Layout parsing is finished and files have been saved with Ids:\n Structure: %s\nText: %s\nPositions: %s\nPageData: %s",
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId()))
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
public Document parseLayout(LayoutParsingType layoutParsingType,
|
||||
PDDocument originDocument,
|
||||
ImageServiceResponse imageServiceResponse,
|
||||
TableServiceResponse tableServiceResponse) {
|
||||
|
||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType,
|
||||
originDocument,
|
||||
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
|
||||
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
|
||||
|
||||
switch (layoutParsingType) {
|
||||
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
|
||||
return DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
}
|
||||
|
||||
|
||||
public Document parseLayoutWithTimer(LayoutParsingType layoutParsingType,
|
||||
PDDocument originDocument,
|
||||
ImageServiceResponse imageServiceResponse,
|
||||
TableServiceResponse tableServiceResponse) {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(layoutParsingType, originDocument,
|
||||
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
|
||||
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
|
||||
|
||||
System.out.printf("parsed %d ms", System.currentTimeMillis() - start);
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
switch (layoutParsingType) {
|
||||
case TAAS -> taasClassificationService.classifyDocument(classificationDocument);
|
||||
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||
case REDACT_MANAGER -> redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
}
|
||||
System.out.printf(", classified %d ms", System.currentTimeMillis() - start);
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
System.out.printf(", sections built %d ms", System.currentTimeMillis() - start);
|
||||
|
||||
start = System.currentTimeMillis();
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
System.out.printf(", graph constructed %d ms", System.currentTimeMillis() - start);
|
||||
return document;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,87 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.CvTableParsingAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.service.ClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.service.PdfParsingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.service.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.mapper.DocumentDataMapper;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class LayoutParsingService {
|
||||
|
||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
private final CvTableParsingAdapter cvTableParsingAdapter;
|
||||
private final LayoutParsingStorageService layoutParsingStorageService;
|
||||
private final PdfParsingService pdfParsingService;
|
||||
private final ClassificationService classificationService;
|
||||
private final SectionsBuilderService sectionsBuilderService;
|
||||
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
PDDocument originDocument = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||
|
||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.pageFileStorageId());
|
||||
}
|
||||
|
||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.pageFileStorageId());
|
||||
}
|
||||
|
||||
Document documentGraph = parseLayout(originDocument, imageServiceResponse, tableServiceResponse);
|
||||
int numberOfPages = originDocument.getNumberOfPages();
|
||||
originDocument.close();
|
||||
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
||||
|
||||
return LayoutParsingFinishedEvent.builder()
|
||||
.identifier(layoutParsingRequest.identifier())
|
||||
.numberOfPages(numberOfPages)
|
||||
.duration(System.currentTimeMillis() - start)
|
||||
.message(format("Layout parsing is finished and files have been saved with Ids:\n Structure: %s\nText: %s\nPositions: %s\nPageData: %s",
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
layoutParsingRequest.textBlockFileStorageId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
layoutParsingRequest.pageFileStorageId()))
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
public Document parseLayout(PDDocument originDocument, ImageServiceResponse imageServiceResponse, TableServiceResponse tableServiceResponse) {
|
||||
|
||||
ClassificationDocument classificationDocument = pdfParsingService.parseDocument(originDocument,
|
||||
cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse),
|
||||
imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse));
|
||||
|
||||
classificationService.classifyDocument(classificationDocument);
|
||||
|
||||
sectionsBuilderService.buildSections(classificationDocument);
|
||||
|
||||
return DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
}
|
||||
|
||||
}
|
||||
@ -13,12 +13,14 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
|
||||
import com.iqser.red.storage.commons.service.StorageService;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTreeData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.PageData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableServiceResponse;
|
||||
@ -67,13 +69,24 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
|
||||
|
||||
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) throws IOException {
|
||||
public void storeDocumentData(LayoutParsingRequest layoutParsingRequest, DocumentData documentData) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), documentData.getDocumentTreeData());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.textBlockFileStorageId(), documentData.getAtomicTextBlocks());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.positionBlockFileStorageId(), documentData.getAtomicPositionBlocks());
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.pageFileStorageId(), documentData.getPages());
|
||||
}
|
||||
|
||||
|
||||
public void storeSectionGrid(LayoutParsingRequest layoutParsingRequest, SectionGrid sectionGrid) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.sectionGridStorageId(), sectionGrid);
|
||||
}
|
||||
|
||||
|
||||
public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
|
||||
}
|
||||
|
||||
|
||||
@ -86,9 +99,7 @@ public class LayoutParsingStorageService {
|
||||
AtomicPositionBlockData[] atomicPositionBlockData = storageService.readJSONObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.positionBlockFileStorageId(),
|
||||
AtomicPositionBlockData[].class);
|
||||
DocumentTreeData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(),
|
||||
layoutParsingRequest.structureFileStorageId(),
|
||||
DocumentTreeData.class);
|
||||
DocumentTreeData tableOfContentsData = storageService.readJSONObject(TenantContext.getTenantId(), layoutParsingRequest.structureFileStorageId(), DocumentTreeData.class);
|
||||
|
||||
return DocumentData.builder()
|
||||
.documentTreeData(tableOfContentsData)
|
||||
|
||||
@ -10,8 +10,8 @@ import java.util.Map;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@ -1,8 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
|
||||
/**
 * Orientation marker with three states.
 * <p>
 * The constant order is part of the contract ({@code ordinal()}/{@code values()}) and
 * must not be changed.
 */
public enum Orientation {
    NONE, LEFT, RIGHT
}
|
||||
@ -1,56 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.utils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@UtilityClass
|
||||
public class FileUtils {
|
||||
|
||||
public File createTempFile(String filenamePrefix, String filenameSuffix) throws IOException {
|
||||
|
||||
File tempFile = Files.createTempFile(filenamePrefix, filenameSuffix).toFile();
|
||||
setRWPermissionsOnlyForOwner(tempFile);
|
||||
|
||||
return tempFile;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Deletes a file; logs a message with the reason if the deletion fails.
|
||||
* This method is null-safe.
|
||||
*
|
||||
* @param file The file to delete. Can be null.
|
||||
*/
|
||||
public void deleteFile(File file) {
|
||||
|
||||
if (file != null) {
|
||||
try {
|
||||
Files.deleteIfExists(file.toPath());
|
||||
} catch (IOException ex) {
|
||||
log.warn("Could not delete file!", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// We don't need to check the results of the permission setters below,
|
||||
// since we're manipulating a file we created ourselves.
|
||||
@SuppressWarnings({"ResultOfMethodCallIgnored", "squid:S899"})
|
||||
private void setRWPermissionsOnlyForOwner(File tempFile) {
|
||||
|
||||
try {
|
||||
tempFile.setReadable(true, true);
|
||||
tempFile.setWritable(true, true);
|
||||
tempFile.setExecutable(false);
|
||||
} catch (SecurityException ex) {
|
||||
// This should never happen since we're creating a temp file ourselves.
|
||||
log.warn("Caught an exception during temp file creation. This should not happend. Check the code.", ex);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -13,13 +13,13 @@ import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Footer;
|
||||
@ -81,8 +81,9 @@ public class DocumentGraphFactory {
|
||||
|
||||
page.getMainBody().add(node);
|
||||
|
||||
List<TextPageBlock> textBlocks = new ArrayList<>(textBlocksToMerge);
|
||||
List<TextPageBlock> textBlocks = new ArrayList<>();
|
||||
textBlocks.add(originalTextBlock);
|
||||
textBlocks.addAll(textBlocksToMerge);
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSortTextPositionSequenceByYThenX(textBlocks), node, context, page);
|
||||
List<Integer> treeId = context.documentTree.createNewChildEntryAndReturnId(parentNode, node);
|
||||
node.setLeafTextBlock(textBlock);
|
||||
|
||||
@ -4,6 +4,8 @@ import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
@ -16,7 +18,9 @@ public class SearchTextWithTextPositionDto {
|
||||
|
||||
String searchText;
|
||||
List<Integer> lineBreaks;
|
||||
List<Integer> stringCoordsToPositionCoords;
|
||||
List<Integer> stringIdxToPositionIdx;
|
||||
List<Boundary> boldTextBoundaries;
|
||||
List<Boundary> italicTextBoundaries;
|
||||
List<Rectangle2D> positions;
|
||||
|
||||
|
||||
@ -26,7 +30,7 @@ public class SearchTextWithTextPositionDto {
|
||||
.searchText("")
|
||||
.lineBreaks(Collections.emptyList())
|
||||
.positions(Collections.emptyList())
|
||||
.stringCoordsToPositionCoords(Collections.emptyList())
|
||||
.stringIdxToPositionIdx(Collections.emptyList())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
@ -2,13 +2,15 @@ package com.knecon.fforesight.service.layoutparser.processor.factory;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@ -24,7 +26,7 @@ public class SearchTextWithTextPositionFactory {
|
||||
public final int MAX_HYPHEN_LINEBREAK_DISTANCE = 3;
|
||||
|
||||
|
||||
public SearchTextWithTextPositionDto buildSearchTextToTextPositionModel(List<TextPositionSequence> sequences) {
|
||||
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<TextPositionSequence> sequences) {
|
||||
|
||||
if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
|
||||
return SearchTextWithTextPositionDto.empty();
|
||||
@ -69,8 +71,10 @@ public class SearchTextWithTextPositionFactory {
|
||||
return SearchTextWithTextPositionDto.builder()
|
||||
.searchText(context.stringBuilder.toString())
|
||||
.lineBreaks(context.lineBreaksStringIdx)
|
||||
.stringCoordsToPositionCoords(context.stringIdxToPositionIdx)
|
||||
.stringIdxToPositionIdx(context.stringIdxToPositionIdx)
|
||||
.positions(positions)
|
||||
.boldTextBoundaries(mergeToBoundaries(context.boldTextsStringIdx))
|
||||
.italicTextBoundaries(mergeToBoundaries(context.italicTextStringIdx))
|
||||
.build();
|
||||
}
|
||||
|
||||
@ -82,6 +86,8 @@ public class SearchTextWithTextPositionFactory {
|
||||
// unicode characters with more than 16-bit encoding have a length > 1 in java strings
|
||||
for (int j = 0; j < currentTextPosition.getUnicode().length(); j++) {
|
||||
context.stringIdxToPositionIdx.add(context.positionIdx);
|
||||
addTextPositionWithFontType(currentTextPosition, "bold", context.boldTextsStringIdx, context.stringIdx);
|
||||
addTextPositionWithFontType(currentTextPosition, "italic", context.italicTextStringIdx, context.stringIdx);
|
||||
}
|
||||
context.stringIdx += currentTextPosition.getUnicode().length();
|
||||
}
|
||||
@ -103,6 +109,33 @@ public class SearchTextWithTextPositionFactory {
|
||||
return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE;
|
||||
}
|
||||
|
||||
private static List<Boundary> mergeToBoundaries(List<Integer> integers) {
|
||||
|
||||
if (integers.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
List<Boundary> boundaries = new LinkedList<>();
|
||||
int start = integers.get(0);
|
||||
int end = integers.get(0) + 1;
|
||||
for (int current : integers) {
|
||||
if (current > end + 1) {
|
||||
boundaries.add(new Boundary(start, end));
|
||||
start = current;
|
||||
}
|
||||
end = current + 1;
|
||||
}
|
||||
if (boundaries.isEmpty())
|
||||
boundaries.add(new Boundary(start, end));
|
||||
return boundaries;
|
||||
}
|
||||
|
||||
|
||||
private static void addTextPositionWithFontType(RedTextPosition currentTextPosition, String fontType, List<Integer> fontTypePositions, int stringIdx) {
|
||||
|
||||
if (currentTextPosition.getFontName().toLowerCase().contains(fontType)) {
|
||||
fontTypePositions.add(stringIdx);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
|
||||
|
||||
@ -173,6 +206,8 @@ public class SearchTextWithTextPositionFactory {
|
||||
|
||||
List<Integer> stringIdxToPositionIdx = new LinkedList<>();
|
||||
List<Integer> lineBreaksStringIdx = new LinkedList<>();
|
||||
List<Integer> boldTextsStringIdx = new LinkedList<>();
|
||||
List<Integer> italicTextStringIdx = new LinkedList<>();
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
|
||||
int stringIdx;
|
||||
|
||||
@ -10,10 +10,10 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Section;
|
||||
@ -80,7 +80,7 @@ public class SectionNodeFactory {
|
||||
remainingBlocks.removeAll(alreadyMerged);
|
||||
|
||||
if (abstractPageBlock instanceof TextPageBlock) {
|
||||
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY(abstractPageBlock, remainingBlocks);
|
||||
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(textBlocks);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
|
||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||
@ -123,7 +123,7 @@ public class SectionNodeFactory {
|
||||
List<AbstractPageBlock> previousList = splitList.get(i - 1);
|
||||
AbstractPageBlock lastPageBlockInPreviousList = previousList.get(previousList.size() - 1);
|
||||
if (lastPageBlockInPreviousList.isHeadline()) {
|
||||
previousList.remove(i - 1);
|
||||
previousList.remove(previousList.size() - 1);
|
||||
splitList.get(i).add(0, lastPageBlockInPreviousList);
|
||||
}
|
||||
}
|
||||
@ -162,7 +162,7 @@ public class SectionNodeFactory {
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsY(AbstractPageBlock atc, List<AbstractPageBlock> pageBlocks) {
|
||||
private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsY(TextPageBlock atc, List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
return pageBlocks.stream()
|
||||
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
|
||||
@ -170,6 +170,7 @@ public class SectionNodeFactory {
|
||||
.filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock)
|
||||
.filter(abstractTextContainer -> abstractTextContainer.intersectsY(atc))
|
||||
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
|
||||
.filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir())
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -7,10 +7,10 @@ import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||
|
||||
@ -2,7 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.factory;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
|
||||
@ -26,12 +26,33 @@ public class TextBlockFactory {
|
||||
|
||||
public AtomicTextBlock buildAtomicTextBlock(List<TextPositionSequence> sequences, SemanticNode parent, Integer numberOnPage, Page page) {
|
||||
|
||||
SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionModel(sequences);
|
||||
SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionDto(sequences);
|
||||
int offset = stringOffset;
|
||||
stringOffset += searchTextWithTextPositionDto.getSearchText().length();
|
||||
long idx = textBlockIdx;
|
||||
textBlockIdx++;
|
||||
return AtomicTextBlock.fromSearchTextWithTextPositionDto(searchTextWithTextPositionDto, parent, offset, idx, numberOnPage, page);
|
||||
String orientation;
|
||||
int textDirection;
|
||||
if (sequences.isEmpty()) {
|
||||
orientation = null;
|
||||
textDirection = 0;
|
||||
} else {
|
||||
orientation = sequences.get(0).getDir().toString();
|
||||
textDirection = sequences.get(0).getRotation();
|
||||
}
|
||||
return AtomicTextBlock.fromSearchTextWithTextPosition(searchTextWithTextPositionDto.getSearchText(),
|
||||
searchTextWithTextPositionDto.getLineBreaks(),
|
||||
searchTextWithTextPositionDto.getBoldTextBoundaries(),
|
||||
searchTextWithTextPositionDto.getItalicTextBoundaries(),
|
||||
searchTextWithTextPositionDto.getPositions(),
|
||||
searchTextWithTextPositionDto.getStringIdxToPositionIdx(),
|
||||
idx,
|
||||
parent,
|
||||
numberOnPage,
|
||||
page,
|
||||
offset,
|
||||
orientation,
|
||||
textDirection);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -5,8 +5,7 @@ import static java.lang.String.format;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Setter;
|
||||
@ -109,6 +108,10 @@ public class Boundary implements Comparable<Boundary> {
|
||||
return splitBoundaries;
|
||||
}
|
||||
|
||||
public IntStream intStream() {
|
||||
|
||||
return IntStream.range(start, end);
|
||||
}
|
||||
|
||||
public static Boundary merge(Collection<Boundary> boundaries) {
|
||||
|
||||
@ -138,26 +141,4 @@ public class Boundary implements Comparable<Boundary> {
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* shrinks the boundary, such that textBlock.subSequence(boundary) returns a string without whitespaces.
|
||||
*
|
||||
* @param textBlock TextBlock to check whitespaces against
|
||||
* @return boundary
|
||||
*/
|
||||
public Boundary trim(TextBlock textBlock) {
|
||||
|
||||
int trimmedStart = this.start;
|
||||
while (Character.isWhitespace(textBlock.charAt(trimmedStart))) {
|
||||
trimmedStart++;
|
||||
}
|
||||
|
||||
int trimmedEnd = this.end;
|
||||
while (Character.isWhitespace(textBlock.charAt(trimmedEnd - 1))) {
|
||||
trimmedEnd--;
|
||||
}
|
||||
|
||||
return new Boundary(trimmedStart, Math.max(trimmedEnd, trimmedStart));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -7,7 +7,7 @@ import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||
|
||||
@ -11,7 +11,7 @@ import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.amazonaws.services.kms.model.NotFoundException;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
|
||||
@ -4,7 +4,7 @@ import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
|
||||
@ -4,7 +4,7 @@ import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
|
||||
@ -4,7 +4,7 @@ import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
|
||||
@ -8,7 +8,7 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
|
||||
@ -5,7 +5,7 @@ import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlockCollector;
|
||||
|
||||
@ -4,7 +4,7 @@ import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
|
||||
@ -4,7 +4,7 @@ import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.graph.nodes;
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
@ -10,14 +11,14 @@ import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.EntityType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
public interface SemanticNode {
|
||||
|
||||
@ -59,6 +60,12 @@ public interface SemanticNode {
|
||||
}
|
||||
|
||||
|
||||
default Page getFirstPage() {
|
||||
|
||||
return getTextBlock().getPages().stream().min(Comparator.comparingInt(Page::getNumber)).orElseThrow(() -> new IllegalStateException("SemanticNode has no Page!"));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Each AtomicTextBlock is assigned a page, so to get the pages for this boundary, it collects the PageNodes from each AtomicTextBlock belonging to this node's TextBlock.
|
||||
*
|
||||
@ -306,7 +313,6 @@ public interface SemanticNode {
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* This function is used during insertion of EntityNodes into the graph, it checks if the boundary of the RedactionEntity intersects or even contains the RedactionEntity.
|
||||
* It sets the fields accordingly and recursively calls this function on all its children.
|
||||
|
||||
@ -9,7 +9,7 @@ import java.util.Set;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
|
||||
@ -7,7 +7,7 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.entity.RedactionEntity;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.graph.textblock;
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
@ -10,9 +11,8 @@ import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.factory.SearchTextWithTextPositionDto;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||
@ -38,11 +38,20 @@ public class AtomicTextBlock implements TextBlock {
|
||||
//string coordinates
|
||||
Boundary boundary;
|
||||
String searchText;
|
||||
List<Integer> lineBreaks;
|
||||
@Builder.Default
|
||||
List<Integer> lineBreaks = new ArrayList<>();
|
||||
@Builder.Default
|
||||
List<Boundary> boldTextBoundaries = new ArrayList<>();
|
||||
@Builder.Default
|
||||
List<Boundary> italicTextBoundaries = new ArrayList<>();
|
||||
String orientation;
|
||||
int textDirection;
|
||||
|
||||
//position coordinates
|
||||
List<Integer> stringIdxToPositionIdx;
|
||||
List<Rectangle2D> positions;
|
||||
@Builder.Default
|
||||
List<Integer> stringIdxToPositionIdx = new ArrayList<>();
|
||||
@Builder.Default
|
||||
List<Rectangle2D> positions = new ArrayList<>();
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
SemanticNode parent;
|
||||
@ -55,23 +64,34 @@ public class AtomicTextBlock implements TextBlock {
|
||||
}
|
||||
|
||||
|
||||
public static AtomicTextBlock fromSearchTextWithTextPositionDto(SearchTextWithTextPositionDto searchTextWithTextPositionDto,
|
||||
SemanticNode parent,
|
||||
int stringOffset,
|
||||
Long textBlockIdx,
|
||||
Integer numberOnPage,
|
||||
Page page) {
|
||||
public static AtomicTextBlock fromSearchTextWithTextPosition(String searchText,
|
||||
List<Integer> lineBreaks,
|
||||
List<Boundary> boldTextBoundaries,
|
||||
List<Boundary> italicTextBoundaries,
|
||||
List<Rectangle2D> positions,
|
||||
List<Integer> stringIdxToPositionIdx,
|
||||
long idx,
|
||||
SemanticNode parent,
|
||||
int numberOnPage,
|
||||
Page page,
|
||||
int offset,
|
||||
String orientation,
|
||||
int textDirection) {
|
||||
|
||||
return AtomicTextBlock.builder()
|
||||
.id(textBlockIdx)
|
||||
.id(idx)
|
||||
.parent(parent)
|
||||
.searchText(searchTextWithTextPositionDto.getSearchText())
|
||||
.searchText(searchText)
|
||||
.numberOnPage(numberOnPage)
|
||||
.page(page)
|
||||
.lineBreaks(searchTextWithTextPositionDto.getLineBreaks())
|
||||
.positions(searchTextWithTextPositionDto.getPositions())
|
||||
.stringIdxToPositionIdx(searchTextWithTextPositionDto.getStringCoordsToPositionCoords())
|
||||
.boundary(new Boundary(stringOffset, stringOffset + searchTextWithTextPositionDto.getSearchText().length()))
|
||||
.lineBreaks(lineBreaks)
|
||||
.boldTextBoundaries(boldTextBoundaries)
|
||||
.italicTextBoundaries(italicTextBoundaries)
|
||||
.positions(positions)
|
||||
.stringIdxToPositionIdx(stringIdxToPositionIdx)
|
||||
.boundary(new Boundary(offset, offset + searchText.length()))
|
||||
.textDirection(textDirection)
|
||||
.orientation(orientation)
|
||||
.build();
|
||||
}
|
||||
|
||||
@ -82,11 +102,8 @@ public class AtomicTextBlock implements TextBlock {
|
||||
.id(textBlockIdx)
|
||||
.boundary(new Boundary(stringOffset, stringOffset))
|
||||
.searchText("")
|
||||
.lineBreaks(Collections.emptyList())
|
||||
.page(page)
|
||||
.numberOnPage(numberOnPage)
|
||||
.stringIdxToPositionIdx(Collections.emptyList())
|
||||
.positions(Collections.emptyList())
|
||||
.parent(parent)
|
||||
.build();
|
||||
}
|
||||
@ -191,7 +208,7 @@ public class AtomicTextBlock implements TextBlock {
|
||||
List<Rectangle2D> rectanglesPerLine = stringBoundary.split(getAllLineBreaksInBoundary(stringBoundary))
|
||||
.stream()
|
||||
.map(this::getPositions)
|
||||
.map(RectangleTransformations::rectangleUnionWithGaps)
|
||||
.map(RectangleTransformations::rectangleBBoxWithGaps)
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
Map<Page, List<Rectangle2D>> rectanglePerLinePerPage = new HashMap<>();
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.graph.textblock;
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
@ -182,4 +183,38 @@ public class ConcatenatedTextBlock implements TextBlock {
|
||||
return getSearchText();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Boundary> getBoldTextBoundaries() {
|
||||
|
||||
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getBoldTextBoundaries).flatMap(Collection::stream).toList();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public List<Boundary> getItalicTextBoundaries() {
|
||||
|
||||
return getAtomicTextBlocks().stream().map(AtomicTextBlock::getItalicTextBoundaries).flatMap(Collection::stream).toList();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String getOrientation() {
|
||||
|
||||
if (atomicTextBlocks.isEmpty()) {
|
||||
return "";
|
||||
}
|
||||
return atomicTextBlocks.get(0).getOrientation();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int getTextDirection() {
|
||||
|
||||
if (atomicTextBlocks.isEmpty()) {
|
||||
return 0;
|
||||
}
|
||||
return atomicTextBlocks.get(0).getTextDirection();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -21,6 +21,18 @@ public interface TextBlock extends CharSequence {
|
||||
List<AtomicTextBlock> getAtomicTextBlocks();
|
||||
|
||||
|
||||
List<Boundary> getBoldTextBoundaries();
|
||||
|
||||
|
||||
List<Boundary> getItalicTextBoundaries();
|
||||
|
||||
|
||||
String getOrientation();
|
||||
|
||||
|
||||
int getTextDirection();
|
||||
|
||||
|
||||
Boundary getBoundary();
|
||||
|
||||
|
||||
|
||||
@ -1,15 +1,15 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.mapper;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.mapper.redaction;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.PageData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTreeData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.mapper;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.mapper.redaction;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
@ -7,11 +7,11 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicPositionBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.AtomicTextBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.DocumentTreeData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.PageData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicPositionBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.AtomicTextBlockData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTreeData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.PageData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Footer;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.mapper;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.mapper.redaction;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
@ -10,7 +10,7 @@ import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Image;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
public class PropertiesMapper {
|
||||
|
||||
@ -0,0 +1,132 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.mapper.taas;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ParagraphData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.Range;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.RowData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.StructureObject;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.TableData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.TextBlock;
|
||||
|
||||
public class TaasDocumentDataMapper {
|
||||
|
||||
public static ResearchDocumentData fromDocument(Document document) {
|
||||
|
||||
AtomicInteger structureObjectNumber = new AtomicInteger();
|
||||
List<StructureObject> structureObjects = document.streamAllSubNodes()
|
||||
.filter(node -> !node.getType().equals(NodeType.TABLE_CELL))
|
||||
.filter(node -> !node.getType().equals(NodeType.SECTION))
|
||||
.map(node -> {
|
||||
if (node.getType().equals(NodeType.TABLE)) {
|
||||
return TaasDocumentDataMapper.fromTableWithTableData((Table) node, structureObjectNumber.getAndIncrement());
|
||||
} else {
|
||||
return TaasDocumentDataMapper.fromSemanticNodeWithParagraphData(node, structureObjectNumber.getAndIncrement());
|
||||
}
|
||||
})
|
||||
.toList();
|
||||
return ResearchDocumentData.builder().structureObjects(structureObjects).build();
|
||||
}
|
||||
|
||||
|
||||
public static ParagraphData fromTextBlock(String classification, TextBlock textBlock) {
|
||||
|
||||
return ParagraphData.builder()
|
||||
.boldTextBoundaries(textBlock.getBoldTextBoundaries().stream().map(b -> new Range(b.start(), b.end())).toList())
|
||||
.italicTextBoundaries(textBlock.getItalicTextBoundaries().stream().map(b -> new Range(b.start(), b.end())).toList())
|
||||
.text(textBlock.getSearchText())
|
||||
.linebreaks(textBlock.getLineBreaks())
|
||||
.classification(classification)
|
||||
.orientation(textBlock.getOrientation())
|
||||
.textDirection(textBlock.getTextDirection())
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
public static TableData fromTable(Table table) {
|
||||
|
||||
List<RowData> rowData = IntStream.range(0, table.getNumberOfRows())
|
||||
.boxed()
|
||||
.map(rowIdx -> table.streamRow(rowIdx).toList())
|
||||
.map(TaasDocumentDataMapper::fromTableCells)
|
||||
.toList();
|
||||
return new TableData(rowData, table.getNumberOfCols(), table.getNumberOfRows());
|
||||
}
|
||||
|
||||
|
||||
public static RowData fromTableCells(List<TableCell> tableCells) {
|
||||
|
||||
if (tableCells.isEmpty()) {
|
||||
throw new IllegalArgumentException("no table cells provided");
|
||||
}
|
||||
boolean header = tableCells.stream().allMatch(TableCell::isHeader);
|
||||
Page firstPage = tableCells.get(0).getFirstPage();
|
||||
Rectangle2D bBox = tableCells.stream().map(TableCell::getBBox).reduce((map1, map2) -> {
|
||||
map1.forEach((page, rectangle) -> map2.merge(page, rectangle, (rect1, rect2) -> rect1.createUnion(rect2).getBounds2D()));
|
||||
return map2;
|
||||
}).orElseThrow().get(firstPage);
|
||||
List<TextBlock> textBlocks = tableCells.stream().map(TableCell::getTextBlock).toList();
|
||||
return new RowData(header, textBlocks.stream().map(textBlock -> TaasDocumentDataMapper.fromTextBlock("table_cell", textBlock)).toList(), toFloatArray(bBox));
|
||||
}
|
||||
|
||||
|
||||
private static Range toRange(Boundary boundary) {
|
||||
|
||||
return new Range(boundary.start(), boundary.end());
|
||||
}
|
||||
|
||||
|
||||
private static List<Range> toRange(List<Boundary> boundary) {
|
||||
|
||||
return boundary.stream().map(TaasDocumentDataMapper::toRange).toList();
|
||||
}
|
||||
|
||||
|
||||
public static StructureObject fromSemanticNodeWithParagraphData(SemanticNode semanticNode, Integer structureObjectNumber) {
|
||||
|
||||
Page page = semanticNode.getFirstPage();
|
||||
Rectangle2D bBox = semanticNode.getBBox().get(page);
|
||||
return StructureObject.builder()
|
||||
.structureObjectNumber(structureObjectNumber)
|
||||
.boundingBox(toFloatArray(bBox))
|
||||
.stringOffset(semanticNode.getBoundary().start())
|
||||
.page(page.getNumber())
|
||||
.paragraph(TaasDocumentDataMapper.fromTextBlock(semanticNode.getType().toString().toLowerCase(Locale.ROOT), semanticNode.getTextBlock()))
|
||||
.table(null)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
public static StructureObject fromTableWithTableData(Table table, int structureObjectNumber) {
|
||||
|
||||
Page page = table.getFirstPage();
|
||||
Rectangle2D bBox = table.getBBox().get(page);
|
||||
return StructureObject.builder()
|
||||
.structureObjectNumber(structureObjectNumber)
|
||||
.boundingBox(toFloatArray(bBox))
|
||||
.stringOffset(table.getBoundary().start())
|
||||
.page(page.getNumber())
|
||||
.paragraph(null)
|
||||
.table(TaasDocumentDataMapper.fromTable(table))
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
private static float[] toFloatArray(Rectangle2D bBox) {
|
||||
|
||||
return new float[]{(float) bBox.getX(), (float) bBox.getY(), (float) bBox.getWidth(), (float) bBox.getHeight()};
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,8 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -1,11 +1,11 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -1,8 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -1,8 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -1,11 +1,11 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
@ -1,10 +1,10 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
@ -0,0 +1,27 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class GapInformation {
|
||||
|
||||
List<List<Rectangle2D>> xGaps;
|
||||
List<List<Rectangle2D>> yGaps;
|
||||
|
||||
|
||||
public GapInformation() {
|
||||
|
||||
xGaps = new LinkedList<>();
|
||||
yGaps = new LinkedList<>();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,23 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class LineInformation {
|
||||
|
||||
List<Rectangle2D> lineBBox;
|
||||
List<List<TextPositionSequence>> sequencesByLines;
|
||||
List<List<Rectangle2D>> bBoxWithGapsByLines;
|
||||
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines;
|
||||
|
||||
}
|
||||
@ -0,0 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
/**
 * Orientation values used by the layout parser model.
 * <p>
 * The declaration order (NONE, LEFT, RIGHT) is kept as is, since ordinals and
 * {@code values()} depend on it.
 */
public enum Orientation {
    NONE,
    LEFT,
    RIGHT
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
public enum PageBlockType {
|
||||
H1,
|
||||
@ -0,0 +1,21 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
|
||||
@Getter
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
public class PageContents {
|
||||
|
||||
List<TextPositionSequence> sortedTextPositionSequences;
|
||||
Rectangle2D cropBox;
|
||||
Rectangle2D mediaBox;
|
||||
|
||||
}
|
||||
@ -0,0 +1,17 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
public class PageInformation {
|
||||
|
||||
PageContents pageContents;
|
||||
LineInformation lineInformation;
|
||||
Rectangle2D mainBodyTextFrame;
|
||||
GapInformation gapInformation;
|
||||
|
||||
}
|
||||
@ -0,0 +1,123 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@AllArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class SectionIdentifier {
|
||||
|
||||
static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
|
||||
|
||||
private enum Format {
|
||||
EMPTY,
|
||||
NUMERICAL,
|
||||
DOCUMENT
|
||||
}
|
||||
|
||||
Format format;
|
||||
String identifierString;
|
||||
List<Integer> identifiers;
|
||||
boolean asChild;
|
||||
|
||||
|
||||
public static SectionIdentifier fromSearchText(String headline) {
|
||||
|
||||
if (headline == null || headline.isEmpty() || headline.isBlank()) {
|
||||
return SectionIdentifier.empty();
|
||||
}
|
||||
|
||||
Matcher numericalIdentifierMatcher = numericalIdentifierPattern.matcher(headline);
|
||||
if (numericalIdentifierMatcher.find()) {
|
||||
return buildNumericalSectionIdentifier(headline, numericalIdentifierMatcher);
|
||||
}
|
||||
// more formats here
|
||||
return SectionIdentifier.empty();
|
||||
}
|
||||
|
||||
|
||||
public static SectionIdentifier asChildOf(SectionIdentifier sectionIdentifier) {
|
||||
|
||||
return new SectionIdentifier(sectionIdentifier.format, sectionIdentifier.toString(), sectionIdentifier.identifiers, true);
|
||||
}
|
||||
|
||||
|
||||
public static SectionIdentifier document() {
|
||||
|
||||
return new SectionIdentifier(Format.DOCUMENT, "document", Collections.emptyList(), false);
|
||||
}
|
||||
|
||||
|
||||
public static SectionIdentifier empty() {
|
||||
|
||||
return new SectionIdentifier(Format.EMPTY, "empty", Collections.emptyList(), false);
|
||||
}
|
||||
|
||||
|
||||
private static SectionIdentifier buildNumericalSectionIdentifier(String headline, Matcher numericalIdentifierMatcher) {
|
||||
|
||||
String identifierString = headline.substring(numericalIdentifierMatcher.start(), numericalIdentifierMatcher.end());
|
||||
List<Integer> identifiers = new LinkedList<>();
|
||||
for (int i = 1; i <= 4; i++) {
|
||||
String numericalIdentifier = numericalIdentifierMatcher.group(i);
|
||||
if (numericalIdentifier == null || numericalIdentifier.equals("0") || numericalIdentifier.isEmpty() || numericalIdentifier.isBlank()) {
|
||||
break;
|
||||
}
|
||||
identifiers.add(Integer.parseInt(numericalIdentifier.trim()));
|
||||
}
|
||||
return new SectionIdentifier(Format.NUMERICAL, identifierString, identifiers.stream().toList(), false);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Determines if the current section is the parent of the given section.
|
||||
*
|
||||
* @param sectionIdentifier The section identifier to compare against.
|
||||
* @return true if the current section is the parent of the given section, false otherwise.
|
||||
*/
|
||||
public boolean isParentOf(SectionIdentifier sectionIdentifier) {
|
||||
|
||||
if (this.format.equals(Format.EMPTY)) {
|
||||
return false;
|
||||
}
|
||||
if (this.format.equals(Format.DOCUMENT)) {
|
||||
return true;
|
||||
}
|
||||
if (!this.format.equals(sectionIdentifier.format)) {
|
||||
return false;
|
||||
}
|
||||
if (this.identifiers.size() >= sectionIdentifier.identifiers.size() && !(this.identifiers.size() == sectionIdentifier.identifiers.size() && sectionIdentifier.asChild)) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < this.identifiers.size(); i++) {
|
||||
if (!this.identifiers.get(i).equals(sectionIdentifier.identifiers.get(i))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public boolean isChildOf(SectionIdentifier sectionIdentifier) {
|
||||
|
||||
if (this.format.equals(Format.DOCUMENT) || this.format.equals(Format.EMPTY)) {
|
||||
return false;
|
||||
}
|
||||
return sectionIdentifier.isParentOf(this);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
return identifierString;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.image;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.image;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.Value;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
@ -11,8 +11,8 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.CohenSutherlandClipping;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.table;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
@ -8,9 +8,9 @@ import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.springframework.beans.BeanUtils;
|
||||
@ -1,8 +1,8 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonCreator;
|
||||
import com.fasterxml.jackson.annotation.JsonValue;
|
||||
@ -1,21 +1,28 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.TextNormalizationUtilities;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
@ -67,6 +74,64 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
return sequences.get(0).getPageWidth();
|
||||
}
|
||||
|
||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||
|
||||
List<TextPositionSequence> sequences = textBlocksToMerge.stream().map(TextPageBlock::getSequences).flatMap(java.util.Collection::stream).toList();
|
||||
sequences = new ArrayList<>(sequences);
|
||||
return fromTextPositionSequences(sequences);
|
||||
}
|
||||
|
||||
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
|
||||
if (textBlock != null) {
|
||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences()
|
||||
.stream()
|
||||
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
|
||||
.collect(toSet())
|
||||
.size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Returns the minX value in pdf coordinate system.
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
@ -8,8 +8,8 @@ import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.dslplatform.json.JsonAttribute;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
|
||||
@ -25,7 +25,6 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
@JsonIgnoreProperties({"empty"})
|
||||
public class TextPositionSequence implements CharSequence {
|
||||
|
||||
public static final int HEIGHT_PADDING = 2;
|
||||
@ -36,6 +35,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
private int rotation;
|
||||
private float pageHeight;
|
||||
private float pageWidth;
|
||||
private boolean isParagraphStart;
|
||||
|
||||
|
||||
public TextPositionSequence(int page) {
|
||||
@ -44,7 +44,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
public TextPositionSequence(List<TextPosition> textPositions, int page) {
|
||||
public TextPositionSequence(List<TextPosition> textPositions, int page, boolean isParagraphStart) {
|
||||
|
||||
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
|
||||
this.page = page;
|
||||
@ -52,6 +52,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
this.rotation = textPositions.get(0).getRotation();
|
||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||
this.isParagraphStart = isParagraphStart;
|
||||
}
|
||||
|
||||
|
||||
@ -141,6 +142,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
* @return the text direction adjusted minX value
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getMinXDirAdj() {
|
||||
|
||||
return textPositions.get(0).getXDirAdj();
|
||||
@ -155,6 +157,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
* @return the text direction adjusted maxX value
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getMaxXDirAdj() {
|
||||
|
||||
return textPositions.get(textPositions.size() - 1).getXDirAdj() + textPositions.get(textPositions.size() - 1).getWidthDirAdj() + HEIGHT_PADDING;
|
||||
@ -169,6 +172,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
* @return the text direction adjusted minY value. The upper border of the bounding box of the word.
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getMinYDirAdj() {
|
||||
|
||||
return textPositions.get(0).getYDirAdj() - getTextHeight();
|
||||
@ -183,6 +187,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
* @return the text direction adjusted maxY value. The lower border of the bounding box of the word.
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getMaxYDirAdj() {
|
||||
|
||||
return textPositions.get(0).getYDirAdj();
|
||||
@ -191,6 +196,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getTextHeight() {
|
||||
|
||||
return textPositions.get(0).getHeightDir() + HEIGHT_PADDING;
|
||||
@ -198,6 +204,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getHeight() {
|
||||
|
||||
return getMaxYDirAdj() - getMinYDirAdj();
|
||||
@ -205,6 +212,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getWidth() {
|
||||
|
||||
return getMaxXDirAdj() - getMinXDirAdj();
|
||||
@ -212,6 +220,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public String getFont() {
|
||||
|
||||
return textPositions.get(0).getFontName().toLowerCase().replaceAll(",bold", "").replaceAll(",italic", "");
|
||||
@ -219,6 +228,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public String getFontStyle() {
|
||||
|
||||
String lowercaseFontName = textPositions.get(0).getFontName().toLowerCase();
|
||||
@ -237,6 +247,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getFontSize() {
|
||||
|
||||
return textPositions.get(0).getFontSizeInPt();
|
||||
@ -244,6 +255,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
|
||||
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public float getSpaceWidth() {
|
||||
|
||||
return textPositions.get(0).getWidthOfSpace();
|
||||
@ -260,6 +272,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
* @return bounding box of the word in Pdf Coordinate System
|
||||
*/
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
@SneakyThrows
|
||||
public Rectangle getRectangle() {
|
||||
|
||||
@ -299,3 +312,4 @@ public class TextPositionSequence implements CharSequence {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.model.text;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.parsing;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
@ -76,7 +76,7 @@ import org.apache.pdfbox.util.Vector;
|
||||
* THIS CODE IS DELIBERATELY INCORRECT, USE PDFStreamEngine INSTEAD.
|
||||
*/
|
||||
@SuppressWarnings({"PMD", "checkstyle:all"})
|
||||
class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
|
||||
private static final Log LOG = LogFactory.getLog(LegacyPDFStreamEngine.class);
|
||||
|
||||
@ -126,7 +126,7 @@ class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
* This will initialize and process the contents of the stream.
|
||||
*
|
||||
* @param page the page to process
|
||||
* @throws IOException if there is an error accessing the stream.
|
||||
* @throws java.io.IOException if there is an error accessing the stream.
|
||||
*/
|
||||
@Override
|
||||
public void processPage(PDPage page) throws IOException {
|
||||
@ -149,7 +149,7 @@ class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
* written by Ben Litchfield for PDFStreamEngine.
|
||||
*/
|
||||
@Override
|
||||
protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code,Vector displacement) throws IOException {
|
||||
protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, Vector displacement) throws IOException {
|
||||
//
|
||||
// legacy calculations which were previously in PDFStreamEngine
|
||||
//
|
||||
@ -165,7 +165,7 @@ class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
|
||||
float displacementX = displacement.getX();
|
||||
// the sorting algorithm is based on the width of the character. As the displacement
|
||||
// for vertical characters doesn't provide any suitable value for it, we have to
|
||||
// for vertical characters doesn't provide any suitable value for it, we have to
|
||||
// calculate our own
|
||||
if (font.isVertical()) {
|
||||
displacementX = font.getWidth(code) / 1000;
|
||||
@ -382,3 +382,4 @@ class LegacyPDFStreamEngine extends PDFStreamEngine {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.parsing;
|
||||
|
||||
import java.awt.color.CMMException;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
@ -32,14 +32,16 @@ import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSNumber;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Getter
|
||||
@ -195,8 +197,8 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
private void addVisibleRulings(List<Ruling> path, boolean stroke) throws IOException {
|
||||
|
||||
try {
|
||||
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && getGraphicsState().getStrokingColor()
|
||||
.toRGB() == 0 || !stroke && !getGraphicsState().getNonStrokingColor().isPattern() && getGraphicsState().getNonStrokingColor().toRGB() == 0) {
|
||||
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && isBlack(getGraphicsState().getStrokingColor()) || //
|
||||
!stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) {
|
||||
rulings.addAll(path);
|
||||
}
|
||||
} catch (UnsupportedOperationException e) {
|
||||
@ -207,14 +209,27 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private boolean isBlack(PDColor color) {
|
||||
|
||||
try {
|
||||
return color.toRGB() == 0;
|
||||
} catch (CMMException e) {
|
||||
// see https://github.com/haraldk/TwelveMonkeys/issues/124 or https://issues.apache.org/jira/browse/PDFBOX-3531
|
||||
// This is a quick and dirt hack
|
||||
// Happens for file 216.pdf
|
||||
log.debug(e.getMessage());
|
||||
return color.getComponents()[0] == 0 && color.getComponents()[1] == 0 && color.getComponents()[2] == 0 && color.getComponents()[1] == 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
||||
public void writeString(String text, List<TextPosition> textPositions, boolean isParagraphStart) throws IOException {
|
||||
|
||||
int startIndex = 0;
|
||||
RedTextPosition previous = null;
|
||||
|
||||
textPositions.sort(Comparator.comparing(TextPosition::getXDirAdj));
|
||||
|
||||
for (int i = 0; i <= textPositions.size() - 1; i++) {
|
||||
|
||||
if (!textPositionSequences.isEmpty()) {
|
||||
@ -250,7 +265,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
||||
}
|
||||
startIndex = i;
|
||||
}
|
||||
@ -260,7 +275,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
if (!(sublist.isEmpty() || sublist.size() == 1 && (sublist.get(0).getUnicode().equals(" ") || sublist.get(0).getUnicode().equals("\u00A0") || sublist.get(0)
|
||||
.getUnicode()
|
||||
.equals("\t")))) {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
||||
}
|
||||
startIndex = i;
|
||||
}
|
||||
@ -276,11 +291,11 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
// Remove false sequence ends (whitespaces)
|
||||
if (previous != null && sublist.get(0).getYDirAdj() == previous.getYDirAdj() && sublist.get(0)
|
||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < 0.01) {
|
||||
for (TextPosition textPosition : sublist) {
|
||||
textPositionSequences.get(textPositionSequences.size() - 1).add(textPosition);
|
||||
for (TextPosition t : sublist) {
|
||||
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
||||
}
|
||||
} else {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, i == textPositions.size() - 1 && isParagraphStart));
|
||||
}
|
||||
}
|
||||
startIndex = i + 1;
|
||||
@ -303,7 +318,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
textPositionSequences.get(textPositionSequences.size() - 1).add(t);
|
||||
}
|
||||
} else {
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber));
|
||||
textPositionSequences.add(new TextPositionSequence(sublist, pageNumber, isParagraphStart));
|
||||
}
|
||||
}
|
||||
super.writeString(text);
|
||||
@ -328,3 +343,4 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -14,7 +14,7 @@
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.parsing;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.parsing;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
@ -27,6 +27,7 @@ import java.text.Bidi;
|
||||
import java.text.Normalizer;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
@ -240,10 +241,10 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
document = doc;
|
||||
output = outputStream;
|
||||
if (getAddMoreFormatting()) {
|
||||
paragraphEnd = lineSeparator;
|
||||
paragraphEnd = "\n----ParagraphEnd----\n\n";
|
||||
pageStart = lineSeparator;
|
||||
articleStart = lineSeparator;
|
||||
articleEnd = lineSeparator;
|
||||
articleStart = "\n----ArticelStart----\n\n";
|
||||
articleEnd = "\n----ArticelEnd----\n\n";
|
||||
}
|
||||
startDocument(document);
|
||||
processPages(document.getPages());
|
||||
@ -594,9 +595,14 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
// but this caused a lot of regression test failures. So, I'm leaving it be for
|
||||
// now
|
||||
if (!overlap(positionY, positionHeight, maxYForLine, maxHeightForLine)) {
|
||||
writeLine(normalize(line));
|
||||
line.clear();
|
||||
var normalized = normalize(line);
|
||||
// normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent()
|
||||
|
||||
|
||||
lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
|
||||
writeLine(normalized, current.isParagraphStart);
|
||||
line.clear();
|
||||
|
||||
expectedStartOfNextWordX = EXPECTED_START_OF_NEXT_WORD_X_RESET_VALUE;
|
||||
maxYForLine = MAX_Y_FOR_LINE_RESET_VALUE;
|
||||
maxHeightForLine = MAX_HEIGHT_FOR_LINE_RESET_VALUE;
|
||||
@ -630,7 +636,24 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
if (startOfPage && lastPosition == null) {
|
||||
writeParagraphStart();// not sure this is correct for RTL?
|
||||
}
|
||||
|
||||
line.add(new LineItem(position));
|
||||
|
||||
// Collections.sort(line, new Comparator<LineItem>() {
|
||||
//
|
||||
// @Override
|
||||
// public int compare(LineItem str1, LineItem str2) {
|
||||
// if(null == str1.getTextPosition()) {
|
||||
// return 0;
|
||||
// }
|
||||
// else if(null == str2.getTextPosition()) {
|
||||
// return 0;
|
||||
// }
|
||||
// return Float.compare(str1.getTextPosition().getX(), str2.getTextPosition().getX());
|
||||
// }
|
||||
// });
|
||||
|
||||
// line.sort(Comparator.comparing(a -> a.getTextPosition() != null && a.getTextPosition().getX()));
|
||||
}
|
||||
maxHeightForLine = Math.max(maxHeightForLine, positionHeight);
|
||||
minYTopForLine = Math.min(minYTopForLine, positionY - positionHeight);
|
||||
@ -646,7 +669,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
}
|
||||
// print the final line
|
||||
if (line.size() > 0) {
|
||||
writeLine(normalize(line));
|
||||
writeLine(normalize(line), false);
|
||||
writeParagraphEnd();
|
||||
}
|
||||
endArticle();
|
||||
@ -703,7 +726,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
* @param textPositions The TextPositions belonging to the text.
|
||||
* @throws IOException If there is an error when writing the text.
|
||||
*/
|
||||
protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
|
||||
protected void writeString(String text, List<TextPosition> textPositions, boolean isParagraphEnd) throws IOException {
|
||||
|
||||
writeString(text);
|
||||
}
|
||||
@ -998,7 +1021,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
|
||||
|
||||
/**
|
||||
* By default the text stripper will attempt to remove text that overlapps each other. Word paints the same
|
||||
* By default, the text stripper will attempt to remove text that overlaps each other. Word paints the same
|
||||
* character several times in order to make it look bold. By setting this to false all text will be extracted, which
|
||||
* means that certain sections will be duplicated, but better performance will be noticed.
|
||||
*
|
||||
@ -1385,6 +1408,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
} else {
|
||||
writeLineSeparator();
|
||||
writeParagraphSeparator();
|
||||
lastLineStartPosition.setEndParagraphWritten();
|
||||
}
|
||||
} else {
|
||||
writeLineSeparator();
|
||||
@ -1428,6 +1452,10 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
float newXVal = multiplyFloat(getIndentThreshold(), position.getTextPosition().getWidthOfSpace());
|
||||
float positionWidth = multiplyFloat(0.25f, position.getTextPosition().getWidth());
|
||||
|
||||
// if(xGap < 0){
|
||||
// result = true;
|
||||
// }
|
||||
// else
|
||||
if (yGap > newYVal) {
|
||||
result = true;
|
||||
} else if (xGap > newXVal) {
|
||||
@ -1636,12 +1664,13 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
* @param line a list with the words of the given line
|
||||
* @throws IOException if something went wrong
|
||||
*/
|
||||
private void writeLine(List<WordWithTextPositions> line) throws IOException {
|
||||
private void writeLine(List<WordWithTextPositions> line, boolean isParagraphEnd) throws IOException {
|
||||
|
||||
int numberOfStrings = line.size();
|
||||
for (int i = 0; i < numberOfStrings; i++) {
|
||||
WordWithTextPositions word = line.get(i);
|
||||
writeString(word.getText(), word.getTextPositions());
|
||||
word.getTextPositions().sort(Comparator.comparing(TextPosition::getX));
|
||||
writeString(word.getText(), word.getTextPositions(), isParagraphEnd && i == numberOfStrings - 1);
|
||||
if (i < numberOfStrings - 1) {
|
||||
writeWordSeparator();
|
||||
}
|
||||
@ -1963,6 +1992,8 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
private boolean isHangingIndent = false;
|
||||
private boolean isArticleStart = false;
|
||||
|
||||
private boolean endParagraphWritten = false;
|
||||
|
||||
private TextPosition position = null;
|
||||
|
||||
|
||||
@ -2024,6 +2055,16 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
}
|
||||
|
||||
|
||||
public boolean isEndParagraphWritten() {
|
||||
|
||||
return endParagraphWritten;
|
||||
}
|
||||
|
||||
public void setEndParagraphWritten(){
|
||||
endParagraphWritten = true;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Sets the isArticleStart() flag to true.
|
||||
*/
|
||||
@ -2065,3 +2106,4 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -10,11 +10,10 @@ import org.springframework.amqp.rabbit.annotation.RabbitListener;
|
||||
import org.springframework.amqp.rabbit.core.RabbitTemplate;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
@ -25,7 +24,7 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class MessageHandler {
|
||||
|
||||
private final LayoutParsingService layoutParsingService;
|
||||
private final LayoutParsingPipeline layoutParsingPipeline;
|
||||
private final ObjectMapper objectMapper;
|
||||
private final RabbitTemplate rabbitTemplate;
|
||||
|
||||
@ -42,7 +41,7 @@ public class MessageHandler {
|
||||
throw new AmqpRejectAndDontRequeueException(String.format("Error during last layout parsing of request with identifier: %s, do not retry.",
|
||||
layoutParsingRequest.identifier()));
|
||||
}
|
||||
LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingService.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
LayoutParsingFinishedEvent layoutParsingFinishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||
sendLayoutParsingFinishedEvent(layoutParsingFinishedEvent);
|
||||
log.info("Layout parsing finished {} in {} ms", layoutParsingRequest.identifier(), layoutParsingFinishedEvent.duration());
|
||||
}
|
||||
@ -50,11 +49,7 @@ public class MessageHandler {
|
||||
|
||||
public void sendLayoutParsingFinishedEvent(LayoutParsingFinishedEvent layoutParsingFinishedEvent) {
|
||||
|
||||
try {
|
||||
rabbitTemplate.convertAndSend(LAYOUT_PARSING_FINISHED_EVENT_QUEUE, objectMapper.writeValueAsString(layoutParsingFinishedEvent));
|
||||
} catch (JsonProcessingException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
rabbitTemplate.convertAndSend(LAYOUT_PARSING_FINISHED_EVENT_QUEUE, layoutParsingFinishedEvent);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@ -6,18 +6,18 @@ import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.PositionUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
@Service
|
||||
public class BodyTextFrameService {
|
||||
|
||||
private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.9f;
|
||||
private static final float APPROXIMATE_HEADER_LINE_COUNT = 2.0f;
|
||||
|
||||
|
||||
/**
|
||||
@ -0,0 +1,87 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class DividingColumnDetectionService {
|
||||
|
||||
private static final int MAX_NUMBER_OF_COLUMNS = 200;
|
||||
|
||||
private static final int LINE_COUNT_THRESHOLD = 5;
|
||||
|
||||
|
||||
public List<Rectangle2D> detectColumns(PageContents pageContents) {
|
||||
|
||||
|
||||
if (pageContents.getSortedTextPositionSequences().size() < 2) {
|
||||
return List.of(pageContents.getCropBox());
|
||||
}
|
||||
|
||||
GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), pageContents.getCropBox());
|
||||
|
||||
return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getCropBox());
|
||||
}
|
||||
|
||||
|
||||
public List<Rectangle2D> detectColumnsFromLines(List<List<Rectangle2D>> gaps, Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
List<List<Line2D>> allColumnParts = new ArrayList<>(MAX_NUMBER_OF_COLUMNS);
|
||||
for (int columnIndex = 1; columnIndex < MAX_NUMBER_OF_COLUMNS; columnIndex++) {
|
||||
double x = calculateGapLocation(columnIndex, MAX_NUMBER_OF_COLUMNS, mainBodyTextFrame.getWidth()) + mainBodyTextFrame.getMinX();
|
||||
double currentMinY = mainBodyTextFrame.getMaxY();
|
||||
double currentMaxY = 0;
|
||||
int currentLineCount = 0;
|
||||
List<Line2D> columnParts = new LinkedList<>();
|
||||
allColumnParts.add(columnParts);
|
||||
for (int lineNumber = 0; lineNumber < gaps.size(); lineNumber++) {
|
||||
List<Rectangle2D> textBlocksInLine = gaps.get(lineNumber);
|
||||
if (anyBlockIntersectX(textBlocksInLine, x)) {
|
||||
if (lineNumber == gaps.size() - 1) {
|
||||
currentMaxY = mainBodyTextFrame.getMinY();
|
||||
} else {
|
||||
currentMaxY = gaps.get(lineNumber + 1).get(0).getMinY();
|
||||
}
|
||||
currentLineCount++;
|
||||
} else {
|
||||
if (currentLineCount >= LINE_COUNT_THRESHOLD) {
|
||||
columnParts.add(new Line2D.Double(x, currentMinY, x, currentMaxY));
|
||||
}
|
||||
currentMinY = gaps.get(lineNumber).get(0).getMaxY();
|
||||
currentMaxY = currentMinY;
|
||||
currentLineCount = 0;
|
||||
}
|
||||
|
||||
}
|
||||
if (currentLineCount >= LINE_COUNT_THRESHOLD) {
|
||||
columnParts.add(new Line2D.Double(x, currentMinY, x, currentMaxY));
|
||||
}
|
||||
}
|
||||
return Stream.concat(Stream.of(mainBodyTextFrame),
|
||||
allColumnParts.stream()
|
||||
.flatMap(columnParts -> columnParts.stream().map(line -> new Rectangle2D.Double(line.getX2(), line.getY2(), 1, Math.abs(line.getY2() - line.getY1()))))
|
||||
.map(r -> (Rectangle2D) r)).toList();
|
||||
}
|
||||
|
||||
|
||||
private static boolean anyBlockIntersectX(List<Rectangle2D> textBlocksInLine, double x) {
|
||||
|
||||
return textBlocksInLine.stream().anyMatch(gap -> gap.getMinX() < x && x < gap.getMaxX());
|
||||
}
|
||||
|
||||
|
||||
private double calculateGapLocation(int columnIndex, int numberOfColumns, double pageWidth) {
|
||||
|
||||
return (pageWidth / numberOfColumns) * columnIndex;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,163 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class GapDetectionService {
|
||||
|
||||
private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines
|
||||
private static final double Y_GAP_FACTOR = 1;
|
||||
private static final double NEW_LINE_FACTOR = 0.2;
|
||||
|
||||
|
||||
public static GapInformation findGapsInLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
if (sortedTextPositionSequences.isEmpty()) {
|
||||
return new GapInformation();
|
||||
}
|
||||
|
||||
final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences);
|
||||
|
||||
XGapsContext xGapContext = XGapsContext.init(mainBodyTextFrame);
|
||||
YGapsContext yGapContext = YGapsContext.init(mainBodyTextFrame);
|
||||
|
||||
var previousTextPosition = sortedTextPositionSequences.get(0);
|
||||
Rectangle2D rectangle = toRectangle2D(previousTextPosition);
|
||||
|
||||
xGapContext.addGapFromLeftEdgeOfMainBody(rectangle);
|
||||
|
||||
for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) {
|
||||
|
||||
double yDifference = Math.abs(currentTextPosition.getMaxYDirAdj() - previousTextPosition.getMaxYDirAdj());
|
||||
double xGap = Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj());
|
||||
Rectangle2D previousTextPositionBBox = toRectangle2D(previousTextPosition);
|
||||
Rectangle2D currentTextPositionBBox = toRectangle2D(currentTextPosition);
|
||||
|
||||
if (yDifference > avgTextPositionHeight * Y_GAP_FACTOR) {
|
||||
yGapContext.addGap(mainBodyTextFrame.getMinX(),
|
||||
previousTextPositionBBox.getMaxY(),
|
||||
mainBodyTextFrame.getWidth(),
|
||||
-(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY()));
|
||||
}
|
||||
if (yDifference > avgTextPositionHeight * NEW_LINE_FACTOR) {
|
||||
|
||||
xGapContext.addGapToRightEdgeOfMainBody(previousTextPositionBBox);
|
||||
xGapContext.gapsInCurrentLine = new LinkedList<>();
|
||||
xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
|
||||
xGapContext.addGapFromLeftEdgeOfMainBody(currentTextPositionBBox);
|
||||
|
||||
} else if (xGap > avgTextPositionHeight * X_GAP_FACTOR) {
|
||||
addGapToLine(currentTextPositionBBox, previousTextPositionBBox, xGapContext);
|
||||
}
|
||||
previousTextPosition = currentTextPosition;
|
||||
}
|
||||
xGapContext.addGapToRightEdgeOfMainBody(toRectangle2D(sortedTextPositionSequences.get(sortedTextPositionSequences.size() - 1)));
|
||||
xGapContext.gapsPerLine.add(xGapContext.gapsInCurrentLine);
|
||||
|
||||
return new GapInformation(xGapContext.gapsPerLine, yGapContext.gapsPerLine);
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) {
|
||||
|
||||
return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle()));
|
||||
}
|
||||
|
||||
private static Rectangle2D mirrorY(Rectangle2D rectangle2D) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle2D.getX(), Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()), rectangle2D.getWidth(), Math.abs(rectangle2D.getHeight()));
|
||||
}
|
||||
|
||||
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
|
||||
|
||||
context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(),
|
||||
previousTextPosition.getMinY(),
|
||||
currentTextPosition.getMinX() - previousTextPosition.getMaxX(),
|
||||
(previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2));
|
||||
}
|
||||
|
||||
|
||||
private static void assertAllTextPositionsHaveSameDir(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
assert textPositionSequences.stream().map(TextPositionSequence::getDir).allMatch(a -> a.equals(textPositionSequences.get(0).getDir()));
|
||||
}
|
||||
|
||||
|
||||
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
|
||||
}
|
||||
|
||||
|
||||
@AllArgsConstructor
|
||||
private static class YGapsContext {
|
||||
|
||||
List<List<Rectangle2D>> gapsPerLine;
|
||||
List<Rectangle2D> gapsInCurrentLine;
|
||||
Rectangle2D mainBodyTextFrame;
|
||||
|
||||
|
||||
public static YGapsContext init(Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
List<List<Rectangle2D>> initialLinesWithGaps = new LinkedList<>();
|
||||
List<Rectangle2D> initialBlocksInLine = new LinkedList<>();
|
||||
initialLinesWithGaps.add(initialBlocksInLine);
|
||||
return new YGapsContext(initialLinesWithGaps, initialBlocksInLine, mainBodyTextFrame);
|
||||
}
|
||||
|
||||
|
||||
public void addGap(double x1, double y1, double w, double h) {
|
||||
|
||||
gapsInCurrentLine.add(new Rectangle2D.Double(x1, y1, w, h));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@AllArgsConstructor
|
||||
private static class XGapsContext {
|
||||
|
||||
List<List<Rectangle2D>> gapsPerLine;
|
||||
List<Rectangle2D> gapsInCurrentLine;
|
||||
Rectangle2D mainBodyTextFrame;
|
||||
|
||||
|
||||
public static XGapsContext init(Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
List<List<Rectangle2D>> initialLinesWithGaps = new LinkedList<>();
|
||||
List<Rectangle2D> initialBlocksInLine = new LinkedList<>();
|
||||
initialLinesWithGaps.add(initialBlocksInLine);
|
||||
return new XGapsContext(initialLinesWithGaps, initialBlocksInLine, mainBodyTextFrame);
|
||||
}
|
||||
|
||||
|
||||
public void addGapToRightEdgeOfMainBody(Rectangle2D textPosition) {
|
||||
|
||||
Rectangle2D leftGap = new Rectangle2D.Double(textPosition.getMaxX(),
|
||||
textPosition.getMinY(),
|
||||
mainBodyTextFrame.getMaxX() - textPosition.getMaxX(),
|
||||
textPosition.getHeight());
|
||||
gapsInCurrentLine.add(leftGap);
|
||||
}
|
||||
|
||||
|
||||
public void addGapFromLeftEdgeOfMainBody(Rectangle2D textPosition) {
|
||||
|
||||
Rectangle2D leftGap = new Rectangle2D.Double(mainBodyTextFrame.getMinX(),
|
||||
textPosition.getMinY(),
|
||||
textPosition.getMinX() - mainBodyTextFrame.getMinX(),
|
||||
textPosition.getHeight());
|
||||
gapsInCurrentLine.add(leftGap);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,201 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Queue;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class GapsAcrossLinesService {
|
||||
|
||||
private static final double GAP_WIDTH_THRESHOLD_FACTOR = 0.01; // multiplied with avg text height
|
||||
private static final double LINE_COUNT_THRESHOLD_FACTOR = 0.3; // multiplied with average line count per page
|
||||
private static final double DISTANCE_TO_BORDER_THRESHOLD = 1;
|
||||
|
||||
|
||||
public List<Rectangle2D> detectXGapsAcrossLines(GapInformation gapInformation, Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
if (gapInformation.getXGaps().size() < 2) {
|
||||
return List.of(mainBodyTextFrame);
|
||||
}
|
||||
double avgHeight = gapInformation.getXGaps()
|
||||
.stream()
|
||||
.filter(gaps -> !gaps.isEmpty())
|
||||
.map(gaps -> gaps.get(0))
|
||||
.mapToDouble(RectangularShape::getHeight)
|
||||
.average()
|
||||
.orElseThrow();
|
||||
|
||||
ColumnFactory columnFactory = ColumnFactory.init(avgHeight, gapInformation.getXGaps().size());
|
||||
gapInformation.getXGaps().get(0).stream().map(GapAcrossLines::new).forEach(columnFactory::addToQueue);
|
||||
List<List<Rectangle2D>> xGaps = gapInformation.getXGaps();
|
||||
for (var gaps : xGaps.subList(1, xGaps.size())) {
|
||||
|
||||
while (columnFactory.hasGapsToProcess()) {
|
||||
GapAcrossLines gapAcrossLines = columnFactory.getNext();
|
||||
rememberColumnIfValid(columnFactory, gapAcrossLines);
|
||||
elongateColumnsAndFilterForWidth(gapAcrossLines, gaps, columnFactory).forEach(columnFactory::setToStillInProgress);
|
||||
}
|
||||
columnFactory.addStillInProgressToQueue();
|
||||
columnFactory.addGapsToQueue(gaps);
|
||||
}
|
||||
|
||||
return columnFactory.outputGaps.stream()
|
||||
.filter(gapAcrossLines -> columnFactory.outputGaps.stream().filter(gapAcrossLines::intersectsX).noneMatch(gapAcrossLines1 -> gapAcrossLines1.lineCount > gapAcrossLines.lineCount))
|
||||
.filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMinX() - mainBodyTextFrame.getMinX()) > DISTANCE_TO_BORDER_THRESHOLD)
|
||||
.filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMaxX() - mainBodyTextFrame.getMaxX()) > DISTANCE_TO_BORDER_THRESHOLD)
|
||||
.map(GapAcrossLines::getRectangle2D)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private static void rememberColumnIfValid(ColumnFactory columnFactory, GapAcrossLines gapAcrossLines) {
|
||||
|
||||
if (gapAcrossLines.lineCount >= LINE_COUNT_THRESHOLD_FACTOR * (double) columnFactory.lineCount) {
|
||||
columnFactory.outputGaps.add(gapAcrossLines);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static Stream<GapAcrossLines> elongateColumnsAndFilterForWidth(GapAcrossLines gapAcrossLines, List<Rectangle2D> gaps, ColumnFactory columnFactory) {
|
||||
|
||||
return gaps.stream()//
|
||||
.filter(gap -> gapAcrossLines.getIntersectionWidth(gap) > GAP_WIDTH_THRESHOLD_FACTOR * columnFactory.avgHeight)//
|
||||
.map(gapAcrossLines::addNewLineAndShrink);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle2D correctRectangle(Rectangle2D rectangle2D) {
|
||||
|
||||
double minX = Math.min(rectangle2D.getMinX(), rectangle2D.getMaxX());
|
||||
double minY = Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY());
|
||||
double maxX = Math.max(rectangle2D.getMinX(), rectangle2D.getMaxX());
|
||||
double maxY = Math.max(rectangle2D.getMinY(), rectangle2D.getMaxY());
|
||||
return new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY);
|
||||
}
|
||||
|
||||
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
private class GapAcrossLines {
|
||||
|
||||
Rectangle2D rectangle2D;
|
||||
int lineCount = 1;
|
||||
|
||||
|
||||
public GapAcrossLines(Rectangle2D rectangle2D) {
|
||||
|
||||
this.rectangle2D = correctRectangle(rectangle2D);
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsX(Rectangle2D rectangle2D) {
|
||||
|
||||
return rectangle2D.getMinX() < this.rectangle2D.getMaxX() && this.rectangle2D.getMinX() < rectangle2D.getMaxX();
|
||||
}
|
||||
|
||||
|
||||
public boolean intersectsX(GapAcrossLines gapAcrossLines) {
|
||||
|
||||
return this.intersectsX(gapAcrossLines.getRectangle2D());
|
||||
}
|
||||
|
||||
|
||||
public double getIntersectionWidth(Rectangle2D rectangle2D) {
|
||||
|
||||
if (!intersectsX(rectangle2D)) {
|
||||
return -1;
|
||||
}
|
||||
double min_x = Math.max(rectangle2D.getMinX(), this.rectangle2D.getMinX());
|
||||
double max_x = Math.min(rectangle2D.getMaxX(), this.rectangle2D.getMaxX());
|
||||
return max_x - min_x;
|
||||
}
|
||||
|
||||
|
||||
public GapAcrossLines addNewLineAndShrink(Rectangle2D rectangle2D) {
|
||||
|
||||
var correctedRectangle = correctRectangle(rectangle2D);
|
||||
double min_x = Math.max(correctedRectangle.getMinX(), this.rectangle2D.getMinX());
|
||||
double max_x = Math.min(correctedRectangle.getMaxX(), this.rectangle2D.getMaxX());
|
||||
double min_y = correctedRectangle.getMinY();
|
||||
double max_y = this.rectangle2D.getMaxY();
|
||||
double width = max_x - min_x;
|
||||
double height = max_y - min_y;
|
||||
return new GapAcrossLines(new Rectangle2D.Double(min_x, min_y, width, height), lineCount + 1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@RequiredArgsConstructor
|
||||
private class ColumnFactory {
|
||||
|
||||
final double avgHeight;
|
||||
final int lineCount;
|
||||
|
||||
List<GapAcrossLines> outputGaps = new LinkedList<>();
|
||||
Queue<GapAcrossLines> gapsQueue = new LinkedList<>();
|
||||
List<GapAcrossLines> gapsToQueue = new LinkedList<>();
|
||||
|
||||
|
||||
public static ColumnFactory init(double avgHeight, int lineCount) {
|
||||
|
||||
return new ColumnFactory(Math.abs(avgHeight), lineCount);
|
||||
}
|
||||
|
||||
|
||||
public GapAcrossLines getNext() {
|
||||
|
||||
return gapsQueue.remove();
|
||||
}
|
||||
|
||||
|
||||
public void addToQueue(GapAcrossLines gapAcrossLines) {
|
||||
|
||||
gapsQueue.add(gapAcrossLines);
|
||||
}
|
||||
|
||||
|
||||
public void addToQueue(Rectangle2D gap) {
|
||||
|
||||
gapsQueue.add(new GapAcrossLines(gap));
|
||||
}
|
||||
|
||||
|
||||
private boolean hasGapsToProcess() {
|
||||
|
||||
return gapsQueue.peek() != null;
|
||||
}
|
||||
|
||||
|
||||
public void setToStillInProgress(GapAcrossLines gapAcrossLines) {
|
||||
|
||||
gapsToQueue.add(gapAcrossLines);
|
||||
}
|
||||
|
||||
|
||||
private void addStillInProgressToQueue() {
|
||||
|
||||
for (int i = gapsToQueue.size() - 1; i >= 0; i--) {
|
||||
gapsQueue.add(gapsToQueue.remove(i));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void addGapsToQueue(List<Rectangle2D> gaps) {
|
||||
|
||||
gaps.forEach(this::addToQueue);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,63 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class InvisibleTableDetectionService {
|
||||
|
||||
public List<List<Rectangle2D>> detectTable(List<TextPositionSequence> textPositionSequences, Rectangle2D tableBBox) {
|
||||
|
||||
LineInformation lineInformation = LineDetectionService.calculateLineInformation(textPositionSequences);
|
||||
GapInformation gaps = GapDetectionService.findGapsInLines(textPositionSequences, tableBBox);
|
||||
List<Rectangle2D> gapsAcrossLines = GapsAcrossLinesService.detectXGapsAcrossLines(gaps, tableBBox);
|
||||
List<Double> columnXCoords = gapsAcrossLines.stream().map(RectangularShape::getCenterX).toList();
|
||||
int colCount = gapsAcrossLines.size();
|
||||
int rowCount = lineInformation.getLineBBox().size();
|
||||
List<List<Rectangle2D>> cells = new LinkedList<>();
|
||||
List<Rectangle2D> cellsInLine = new LinkedList<>();
|
||||
cells.add(cellsInLine);
|
||||
double x1;
|
||||
double y1;
|
||||
double x2;
|
||||
double y2;
|
||||
for (int col = 0; col < colCount + 1; col++) {
|
||||
for (int row = 0; row < rowCount + 1; row++) {
|
||||
if (col == 0) {
|
||||
x1 = tableBBox.getX();
|
||||
} else {
|
||||
x1 = columnXCoords.get(col - 1);
|
||||
}
|
||||
if (row == 0) {
|
||||
y2 = tableBBox.getMaxY();
|
||||
} else {
|
||||
y2 = lineInformation.getLineBBox().get(row - 1).getY();
|
||||
}
|
||||
if (col == colCount) {
|
||||
x2 = tableBBox.getMaxX();
|
||||
} else {
|
||||
x2 = columnXCoords.get(col);
|
||||
}
|
||||
if (row == rowCount) {
|
||||
y1 = tableBBox.getY();
|
||||
} else {
|
||||
y1 = lineInformation.getLineBBox().get(row).getY();
|
||||
}
|
||||
cellsInLine.add(new Rectangle2D.Double(x1, y1, x2 - x1, y2 - y1));
|
||||
}
|
||||
cellsInLine = new LinkedList<>();
|
||||
cells.add(cellsInLine);
|
||||
}
|
||||
|
||||
return cells;
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,202 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class LineDetectionService {
|
||||
|
||||
private static final double X_GAP_FACTOR = 1; // multiplied with average text height, determines the minimum distance of gaps in lines
|
||||
|
||||
|
||||
public LineInformation calculateLineInformation(List<TextPositionSequence> sortedTextPositionSequences) {
|
||||
|
||||
if (sortedTextPositionSequences.isEmpty()) {
|
||||
return LineFactory.init().build();
|
||||
}
|
||||
|
||||
return buildLineInformation(sortedTextPositionSequences);
|
||||
}
|
||||
|
||||
|
||||
public List<List<Rectangle2D>> findLinesWithGaps(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
return calculateLineInformation(sortedTextPositionSequences).getBBoxWithGapsByLines();
|
||||
}
|
||||
|
||||
|
||||
public List<List<TextPositionSequence>> orderByLines(List<TextPositionSequence> sortedTextPositionSequences, Rectangle2D mainBodyTextFrame) {
|
||||
|
||||
return calculateLineInformation(sortedTextPositionSequences).getSequencesByLines();
|
||||
}
|
||||
|
||||
|
||||
private static LineInformation buildLineInformation(List<TextPositionSequence> sortedTextPositionSequences) {
|
||||
|
||||
final double avgTextPositionHeight = getAvgTextPositionHeight(sortedTextPositionSequences);
|
||||
|
||||
LineFactory lineFactory = LineFactory.init();
|
||||
|
||||
var previousTextPosition = sortedTextPositionSequences.get(0);
|
||||
lineFactory.addToCurrentLine(previousTextPosition);
|
||||
for (TextPositionSequence currentTextPosition : sortedTextPositionSequences.subList(1, sortedTextPositionSequences.size())) {
|
||||
if (isNewLine(currentTextPosition, previousTextPosition, avgTextPositionHeight) || isSplitByOrientation(currentTextPosition, previousTextPosition)) {
|
||||
lineFactory.startNewLine();
|
||||
} else if (isXGap(currentTextPosition, previousTextPosition, avgTextPositionHeight)) {
|
||||
lineFactory.startNewBlock();
|
||||
}
|
||||
lineFactory.addToCurrentLine(currentTextPosition);
|
||||
previousTextPosition = currentTextPosition;
|
||||
}
|
||||
lineFactory.addFinalLine();
|
||||
return lineFactory.build();
|
||||
}
|
||||
|
||||
|
||||
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
|
||||
}
|
||||
|
||||
|
||||
private static boolean isXGap(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) {
|
||||
|
||||
return Math.abs(previousTextPosition.getMaxXDirAdj() - currentTextPosition.getMinXDirAdj()) > (avgTextPositionHeight * X_GAP_FACTOR);
|
||||
}
|
||||
|
||||
|
||||
private static boolean isSplitByOrientation(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition) {
|
||||
|
||||
return !previousTextPosition.getDir().equals(currentTextPosition.getDir());
|
||||
}
|
||||
|
||||
|
||||
private static boolean isNewLine(TextPositionSequence currentTextPosition, TextPositionSequence previousTextPosition, double avgTextPositionHeight) {
|
||||
|
||||
return Math.abs(previousTextPosition.getMinYDirAdj() - currentTextPosition.getMinYDirAdj()) > avgTextPositionHeight;
|
||||
}
|
||||
|
||||
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
private class LineFactory {
|
||||
|
||||
List<Rectangle2D> lineBBox;
|
||||
|
||||
List<List<Rectangle2D>> bBoxWithGapsByLines;
|
||||
List<Rectangle2D> bBoxWithGapsInCurrentLine;
|
||||
|
||||
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines;
|
||||
List<List<TextPositionSequence>> sequencesWithGapsInCurrentLine;
|
||||
|
||||
List<TextPositionSequence> currentSequencesWithoutGaps;
|
||||
|
||||
List<List<TextPositionSequence>> sequencesByLines;
|
||||
List<TextPositionSequence> sequencesInCurrentLine;
|
||||
|
||||
List<List<Rectangle2D>> xGaps;
|
||||
List<List<Rectangle2D>> yGaps;
|
||||
|
||||
|
||||
public static LineFactory init() {
|
||||
|
||||
List<Rectangle2D> lineBBox = new LinkedList<>();
|
||||
|
||||
List<List<Rectangle2D>> bBoxWithGapsByLines = new LinkedList<>();
|
||||
List<Rectangle2D> bBoxWithGapsInCurrentLine = new LinkedList<>();
|
||||
bBoxWithGapsByLines.add(bBoxWithGapsInCurrentLine);
|
||||
|
||||
List<List<List<TextPositionSequence>>> sequencesWithGapsByLines = new LinkedList<>();
|
||||
List<List<TextPositionSequence>> sequencesWithGapsInCurrentLine = new LinkedList<>();
|
||||
sequencesWithGapsByLines.add(sequencesWithGapsInCurrentLine);
|
||||
List<TextPositionSequence> currentSequencesWithoutGaps = new LinkedList<>();
|
||||
sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps);
|
||||
|
||||
List<List<TextPositionSequence>> sequencesByLines = new LinkedList<>();
|
||||
List<TextPositionSequence> sequencesInCurrentLine = new LinkedList<>();
|
||||
sequencesByLines.add(sequencesInCurrentLine);
|
||||
|
||||
return new LineFactory(lineBBox,
|
||||
bBoxWithGapsByLines,
|
||||
bBoxWithGapsInCurrentLine,
|
||||
sequencesWithGapsByLines,
|
||||
sequencesWithGapsInCurrentLine,
|
||||
currentSequencesWithoutGaps,
|
||||
sequencesByLines,
|
||||
sequencesInCurrentLine,
|
||||
null,
|
||||
null);
|
||||
}
|
||||
|
||||
|
||||
public void addGaps(GapInformation gapInformation) {
|
||||
|
||||
this.xGaps = gapInformation.getXGaps();
|
||||
this.yGaps = gapInformation.getYGaps();
|
||||
}
|
||||
|
||||
|
||||
public LineInformation build() {
|
||||
|
||||
return new LineInformation(lineBBox, sequencesByLines, bBoxWithGapsByLines, sequencesWithGapsByLines);
|
||||
}
|
||||
|
||||
|
||||
public void startNewBlock() {
|
||||
|
||||
bBoxWithGapsInCurrentLine.add(textPositionBBox(currentSequencesWithoutGaps));
|
||||
currentSequencesWithoutGaps = new LinkedList<>();
|
||||
sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps);
|
||||
}
|
||||
|
||||
|
||||
public void startNewLine() {
|
||||
|
||||
lineBBox.add(textPositionBBox(sequencesInCurrentLine));
|
||||
|
||||
bBoxWithGapsInCurrentLine.add(textPositionBBox(currentSequencesWithoutGaps));
|
||||
bBoxWithGapsInCurrentLine = new LinkedList<>();
|
||||
bBoxWithGapsByLines.add(bBoxWithGapsInCurrentLine);
|
||||
|
||||
sequencesWithGapsInCurrentLine = new LinkedList<>();
|
||||
sequencesWithGapsByLines.add(sequencesWithGapsInCurrentLine);
|
||||
currentSequencesWithoutGaps = new LinkedList<>();
|
||||
sequencesWithGapsInCurrentLine.add(currentSequencesWithoutGaps);
|
||||
|
||||
sequencesInCurrentLine = new LinkedList<>();
|
||||
sequencesByLines.add(sequencesInCurrentLine);
|
||||
}
|
||||
|
||||
|
||||
private Rectangle2D textPositionBBox(List<TextPositionSequence> textPositionSequences) {
|
||||
|
||||
return RectangleTransformations.rectangleBBox(textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList());
|
||||
}
|
||||
|
||||
|
||||
public void addToCurrentLine(TextPositionSequence current) {
|
||||
|
||||
sequencesInCurrentLine.add(current);
|
||||
currentSequencesWithoutGaps.add(current);
|
||||
}
|
||||
|
||||
|
||||
public void addFinalLine() {
|
||||
|
||||
lineBBox.add(textPositionBBox(sequencesInCurrentLine));
|
||||
bBoxWithGapsInCurrentLine.add(textPositionBBox(currentSequencesWithoutGaps));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,25 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class MainBodyTextFrameExtractionService {
|
||||
|
||||
private static final double TEXT_FRAME_PAD_WIDTH = 0.0;
|
||||
private static final double TEXT_FRAME_PAD_HEIGHT = 0.02;
|
||||
|
||||
|
||||
public Rectangle2D calculateMainBodyTextFrame(LineInformation lineInformation) {
|
||||
|
||||
Rectangle2D mainBodyTextFrame = lineInformation.getLineBBox().stream()
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
return RectangleTransformations.pad(mainBodyTextFrame, mainBodyTextFrame.getWidth() * TEXT_FRAME_PAD_WIDTH, mainBodyTextFrame.getHeight() * TEXT_FRAME_PAD_HEIGHT);
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,24 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class PageInformationService {
|
||||
|
||||
public PageInformation build(PageContents pageContents) {
|
||||
|
||||
LineInformation lineInformation = LineDetectionService.calculateLineInformation(pageContents.getSortedTextPositionSequences());
|
||||
Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(lineInformation);
|
||||
GapInformation gapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedTextPositionSequences(), mainBodyTextFrame);
|
||||
|
||||
return new PageInformation(pageContents, lineInformation, mainBodyTextFrame, gapInformation);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@ -9,16 +9,20 @@ import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.TaasBlockificationService;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
@ -31,11 +35,16 @@ public class PdfParsingService {
|
||||
|
||||
private final RulingCleaningService rulingCleaningService;
|
||||
private final TableExtractionService tableExtractionService;
|
||||
private final BlockificationService blockificationService;
|
||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
private final TaasBlockificationService taasBlockificationService;
|
||||
private final DocuMineBlockificationService docuMineBlockificationService;
|
||||
private final RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
|
||||
|
||||
public ClassificationDocument parseDocument(PDDocument originDocument, Map<Integer, List<TableCells>> pdfTableCells, Map<Integer, List<ClassifiedImage>> pdfImages) {
|
||||
public ClassificationDocument parseDocument(LayoutParsingType layoutParsingType,
|
||||
PDDocument originDocument,
|
||||
Map<Integer, List<TableCells>> pdfTableCells,
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages) {
|
||||
|
||||
ClassificationDocument document = new ClassificationDocument();
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
@ -44,7 +53,7 @@ public class PdfParsingService {
|
||||
long pageCount = originDocument.getNumberOfPages();
|
||||
|
||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||
parsePage(pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber);
|
||||
parsePage(layoutParsingType, pdfImages, originDocument, pdfTableCells, document, classificationPages, pageNumber);
|
||||
}
|
||||
|
||||
document.setPages(classificationPages);
|
||||
@ -54,7 +63,8 @@ public class PdfParsingService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void parsePage(Map<Integer, List<ClassifiedImage>> pdfImages,
|
||||
private void parsePage(LayoutParsingType layoutParsingType,
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages,
|
||||
PDDocument pdDocument,
|
||||
Map<Integer, List<TableCells>> pdfTableCells,
|
||||
ClassificationDocument document,
|
||||
@ -79,7 +89,12 @@ public class PdfParsingService {
|
||||
stripper.getRulings(),
|
||||
stripper.getMinCharWidth(),
|
||||
stripper.getMaxCharHeight());
|
||||
ClassificationPage classificationPage = blockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case TAAS -> taasBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
||||
};
|
||||
|
||||
classificationPage.setRotation(rotation);
|
||||
classificationPage.setLandscape(isLandscape);
|
||||
@ -1,95 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.Area;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.BinaryOperator;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collector;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.textblock.AtomicTextBlock;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class RectangleTransformations {
|
||||
|
||||
public static Rectangle2D pad(Rectangle2D rectangle2D, int deltaX, int deltaY) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle2D.getMinX() - deltaX, rectangle2D.getMinY() - deltaY, rectangle2D.getWidth() + 2 * deltaX, rectangle2D.getHeight() + 2 * deltaY);
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
|
||||
|
||||
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DUnion());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D rectangleUnion(List<Rectangle2D> rectangle2DList) {
|
||||
|
||||
return rectangle2DList.stream().collect(new Rectangle2DUnion());
|
||||
}
|
||||
|
||||
|
||||
public static String toString(Rectangle2D rectangle2D) {
|
||||
|
||||
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D parseRectangle2D(String bBox) {
|
||||
|
||||
List<Float> floats = Arrays.stream(bBox.split(",")).map(Float::parseFloat).toList();
|
||||
return new Rectangle2D.Float(floats.get(0), floats.get(1), floats.get(2), floats.get(3));
|
||||
}
|
||||
|
||||
|
||||
private static class Rectangle2DUnion implements Collector<Rectangle2D, Area, Rectangle2D> {
|
||||
|
||||
@Override
|
||||
public Supplier<Area> supplier() {
|
||||
|
||||
return Area::new;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BiConsumer<Area, Rectangle2D> accumulator() {
|
||||
|
||||
return (area, rectangle2D) -> area.add(new Area(rectangle2D));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BinaryOperator<Area> combiner() {
|
||||
|
||||
return (area1, area2) -> {
|
||||
area1.add(area2);
|
||||
return area1;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Function<Area, Rectangle2D> finisher() {
|
||||
|
||||
return Area::getBounds2D;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<Characteristics> characteristics() {
|
||||
|
||||
return Set.of(Characteristics.CONCURRENT, Characteristics.UNORDERED);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
@ -13,9 +13,9 @@ import java.util.Map;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.adapter.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -0,0 +1,146 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.BinaryOperator;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collector;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.CellRectangle;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionGrid;
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.section.SectionRectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.graph.nodes.TableCell;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
public class SectionGridCreatorService {
|
||||
|
||||
public SectionGrid createSectionGrid(Document document) {
|
||||
|
||||
Map<Integer, List<SectionRectangle>> sectionBBox = document.streamAllSubNodesOfType(NodeType.SECTION).map(SemanticNode::getBBox).collect(new SectionGridCollector());
|
||||
Map<Integer, List<SectionRectangle>> paragraphBBox = document.streamAllSubNodesOfType(NodeType.PARAGRAPH).map(SemanticNode::getBBox).collect(new SectionGridCollector());
|
||||
Map<Integer, List<SectionRectangle>> headlineBBox = document.streamAllSubNodesOfType(NodeType.HEADLINE).map(SemanticNode::getBBox).collect(new SectionGridCollector());
|
||||
Map<Integer, List<SectionRectangle>> tableBBox = document.streamAllSubNodesOfType(NodeType.TABLE).map(node -> (Table) node).collect(new TableGridCollector());
|
||||
var sectionGrid = new SectionGrid();
|
||||
|
||||
sectionGrid.setRectanglesPerPage(mergeMapsByConcatenatingLists(//
|
||||
mergeMapsByConcatenatingLists(paragraphBBox, headlineBBox), //
|
||||
mergeMapsByConcatenatingLists(sectionBBox, tableBBox)));
|
||||
|
||||
return sectionGrid;
|
||||
}
|
||||
|
||||
|
||||
private static abstract class GridCollector<T> implements Collector<T, Map<Integer, List<SectionRectangle>>, Map<Integer, List<SectionRectangle>>> {
|
||||
|
||||
@Override
|
||||
public Supplier<Map<Integer, List<SectionRectangle>>> supplier() {
|
||||
|
||||
return HashMap::new;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Function<Map<Integer, List<SectionRectangle>>, Map<Integer, List<SectionRectangle>>> finisher() {
|
||||
|
||||
return Function.identity();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public BinaryOperator<Map<Integer, List<SectionRectangle>>> combiner() {
|
||||
|
||||
return SectionGridCreatorService::mergeMapsByConcatenatingLists;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public Set<Characteristics> characteristics() {
|
||||
|
||||
return Set.of(Characteristics.IDENTITY_FINISH, Characteristics.CONCURRENT, Characteristics.UNORDERED);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static class TableGridCollector extends GridCollector<Table> {
|
||||
|
||||
@Override
|
||||
public BiConsumer<Map<Integer, List<SectionRectangle>>, Table> accumulator() {
|
||||
|
||||
return (map, table) -> table.getPages()
|
||||
.forEach(page -> map.merge(page.getNumber(), List.of(toSectionRectangle(table, page, table.getPages().size())), SectionGridCreatorService::concatLists));
|
||||
}
|
||||
|
||||
|
||||
private static SectionRectangle toSectionRectangle(Table table, Page page, int numberOfParts) {
|
||||
|
||||
Rectangle2D rect = table.getBBox().get(page);
|
||||
List<CellRectangle> tableCellRectangles = table.streamTableCells()
|
||||
.map(TableCell::getBBox)
|
||||
.map(map -> map.get(page))
|
||||
.filter(Objects::nonNull)
|
||||
.map(rectangle2D -> new CellRectangle(new Point((float) rectangle2D.getX(), (float) rectangle2D.getY()),
|
||||
(float) rectangle2D.getWidth(),
|
||||
(float) rectangle2D.getHeight()))
|
||||
.toList();
|
||||
return new SectionRectangle(new Point((float) rect.getX(), (float) rect.getY()),
|
||||
(float) rect.getWidth(),
|
||||
(float) rect.getHeight(),
|
||||
1,
|
||||
numberOfParts,
|
||||
tableCellRectangles);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static class SectionGridCollector extends GridCollector<Map<Page, Rectangle2D>> {
|
||||
|
||||
@Override
|
||||
public BiConsumer<Map<Integer, List<SectionRectangle>>, Map<Page, Rectangle2D>> accumulator() {
|
||||
|
||||
return (mapToKeep, mapToMerge) -> mapToMerge.forEach((page, rectangle) -> mapToKeep.merge(page.getNumber(),
|
||||
List.of(toSectionRectangle(rectangle, mapToMerge.values().size())),
|
||||
SectionGridCreatorService::concatLists));
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static SectionRectangle toSectionRectangle(Rectangle2D rect, int numberOfParts) {
|
||||
|
||||
return new SectionRectangle(new Point((float) rect.getX(), (float) rect.getY()), (float) rect.getWidth(), (float) rect.getHeight(), 1, numberOfParts, null);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private static Map<Integer, List<SectionRectangle>> mergeMapsByConcatenatingLists(Map<Integer, List<SectionRectangle>> mapToKeep,
|
||||
Map<Integer, List<SectionRectangle>> mapToMerge) {
|
||||
|
||||
mapToMerge.forEach((page, rectangle) -> mapToKeep.merge(page, rectangle, SectionGridCreatorService::concatLists));
|
||||
return mapToKeep;
|
||||
}
|
||||
|
||||
|
||||
private static List<SectionRectangle> concatLists(List<SectionRectangle> l1, List<SectionRectangle> l2) {
|
||||
|
||||
return Stream.concat(l1.stream(), l2.stream()).toList();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
@ -9,18 +9,18 @@ import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.UnclassifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
@ -12,15 +12,15 @@ import java.util.Set;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
|
||||
@Service
|
||||
public class TableExtractionService {
|
||||
@ -136,6 +136,14 @@ public class TableExtractionService {
|
||||
|
||||
public List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
for (Ruling r : horizontalRulingLines) {
|
||||
if (r.getX2() < r.getX1()) {
|
||||
double a = r.getX2();
|
||||
r.x2 = (float) r.getX1();
|
||||
r.x1 = (float) a;
|
||||
}
|
||||
}
|
||||
|
||||
List<Cell> cellsFound = new ArrayList<>();
|
||||
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
|
||||
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
|
||||
@ -0,0 +1,75 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class TextPositionSequenceSorter {
|
||||
|
||||
public List<PageContents> getSortedTextPositionsWithPages(String filename) throws IOException {
|
||||
|
||||
List<PageContents> textPositionSequencesPerPage = new LinkedList<>();
|
||||
try (InputStream inputStream = new ClassPathResource(filename).getInputStream()) {
|
||||
|
||||
PDDocument pdDocument = Loader.loadPDF(inputStream);
|
||||
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setSortByPosition(true);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(pdDocument);
|
||||
|
||||
Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir = stripper.getTextPositionSequences()
|
||||
.stream()
|
||||
.collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees()));
|
||||
|
||||
var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation());
|
||||
|
||||
textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
|
||||
RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
|
||||
RectangleTransformations.toRectangle2D(pdPage.getMediaBox())));
|
||||
}
|
||||
|
||||
pdDocument.close();
|
||||
}
|
||||
|
||||
return textPositionSequencesPerPage;
|
||||
}
|
||||
|
||||
|
||||
public List<TextPositionSequence> sortByDirAccordingToPageRotation(Map<Float, List<TextPositionSequence>> sortedTextPositionSequencesPerDir, int rotation) {
|
||||
|
||||
LinkedList<Float> sortedKeys = new LinkedList<>(sortedTextPositionSequencesPerDir.keySet().stream().sorted().toList());
|
||||
|
||||
for (int i = 0; i < sortedKeys.size(); i++) {
|
||||
if (sortedKeys.get(i) < rotation) {
|
||||
Float keyToSwap = sortedKeys.remove(i);
|
||||
sortedKeys.addLast(keyToSwap);
|
||||
}
|
||||
}
|
||||
return sortedKeys.stream().map(sortedTextPositionSequencesPerDir::get).flatMap(Collection::stream).toList();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,229 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
@Service
|
||||
public class DocuMineBlockificationService {
|
||||
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
/**
|
||||
* This method is building blocks by expanding the minX/maxX and minY/maxY value on each word that is not split by the conditions.
|
||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The words of a page.
|
||||
* @param horizontalRulingLines Horizontal table lines.
|
||||
* @param verticalRulingLines Vertical table lines.
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
|
||||
|
||||
float minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
TextPositionSequence prev = null;
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Float splitX1 = null;
|
||||
for (TextPositionSequence word : textPositions) {
|
||||
|
||||
boolean lineSeparation = prev != null && word.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * 1.25;
|
||||
boolean startFromTop = prev != null && word.getMinYDirAdj() < prev.getMinYDirAdj() - prev.getTextHeight();
|
||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
|
||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||
boolean splitByOtherFontAndOtherY = prev != null && prev.getMaxYDirAdj() != word.getMaxYDirAdj() && (word.getFontStyle().contains("bold") && !prev.getFontStyle()
|
||||
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||
|
||||
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap)) {
|
||||
|
||||
Orientation prevOrientation = null;
|
||||
if (!chunkBlockList1.isEmpty()) {
|
||||
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords);
|
||||
chunkBlockList1.add(cb1);
|
||||
chunkWords = new ArrayList<>();
|
||||
|
||||
if (splitByX && !isSplitByRuling) {
|
||||
wasSplitted = true;
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
splitX1 = word.getMinXDirAdj();
|
||||
} else if (newLineAfterSplit && !isSplitByRuling) {
|
||||
wasSplitted = false;
|
||||
cb1.setOrientation(Orientation.RIGHT);
|
||||
splitX1 = null;
|
||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
||||
cb1.setOrientation(Orientation.LEFT);
|
||||
}
|
||||
|
||||
minX = 1000;
|
||||
maxX = 0;
|
||||
minY = 1000;
|
||||
maxY = 0;
|
||||
prev = null;
|
||||
}
|
||||
|
||||
chunkWords.add(word);
|
||||
|
||||
prev = word;
|
||||
if (word.getMinXDirAdj() < minX) {
|
||||
minX = word.getMinXDirAdj();
|
||||
}
|
||||
if (word.getMaxXDirAdj() > maxX) {
|
||||
maxX = word.getMaxXDirAdj();
|
||||
}
|
||||
if (word.getMinYDirAdj() < minY) {
|
||||
minY = word.getMinYDirAdj();
|
||||
}
|
||||
if (word.getMaxYDirAdj() > maxY) {
|
||||
maxY = word.getMaxYDirAdj();
|
||||
}
|
||||
}
|
||||
|
||||
TextPageBlock cb1 = buildTextBlock(chunkWords);
|
||||
if (cb1 != null) {
|
||||
chunkBlockList1.add(cb1);
|
||||
}
|
||||
|
||||
return new ClassificationPage(chunkBlockList1);
|
||||
}
|
||||
|
||||
|
||||
private boolean equalsWithThreshold(float f1, float f2) {
|
||||
|
||||
return Math.abs(f1 - f2) < THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
|
||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||
|
||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
||||
|
||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
||||
fontFrequencyCounter.add(wordBlock.getFont());
|
||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||
|
||||
if (textBlock == null) {
|
||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
||||
wordBlock.getMaxXDirAdj(),
|
||||
wordBlock.getMinYDirAdj(),
|
||||
wordBlock.getMaxYDirAdj(),
|
||||
wordBlockList,
|
||||
wordBlock.getRotation());
|
||||
} else {
|
||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
||||
}
|
||||
}
|
||||
|
||||
if (textBlock != null) {
|
||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||
}
|
||||
|
||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
||||
}
|
||||
return textBlock;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float minX,
|
||||
float minY,
|
||||
float maxX,
|
||||
float maxY,
|
||||
TextPositionSequence word,
|
||||
List<Ruling> horizontalRulingLines,
|
||||
List<Ruling> verticalRulingLines) {
|
||||
|
||||
return isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(maxX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMinYDirAdj(),
|
||||
horizontalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()) //
|
||||
|| isSplitByRuling(minX,
|
||||
minY,
|
||||
word.getMinXDirAdj(),
|
||||
word.getMaxYDirAdj(),
|
||||
verticalRulingLines,
|
||||
word.getDir().getDegrees(),
|
||||
word.getPageWidth(),
|
||||
word.getPageHeight()); //
|
||||
}
|
||||
|
||||
|
||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
||||
|
||||
for (Ruling ruling : rulingLines) {
|
||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private double round(float value, int decimalPoints) {
|
||||
|
||||
var d = Math.pow(10, decimalPoints);
|
||||
return Math.round(value * d) / d;
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.classification.service;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
import static java.util.stream.Collectors.toSet;
|
||||
|
||||
@ -9,19 +9,19 @@ import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.classification.utils.RulingTextDirAdjustUtil;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
|
||||
@Service
|
||||
@SuppressWarnings("all")
|
||||
public class BlockificationService {
|
||||
@Service
|
||||
public class RedactManagerBlockificationService {
|
||||
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user