Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
aa1e77dda3 | ||
|
|
0c3b910088 | ||
|
|
b4e5f2da2f | ||
|
|
b47b187c8a | ||
|
|
0d4800622d | ||
|
|
cc492bc50d | ||
|
|
1c3c5385e1 | ||
|
|
bce2558133 | ||
|
|
2200574b6d | ||
|
|
775c943f7e | ||
|
|
b7fe6fd3c4 | ||
|
|
3dd447ebef | ||
|
|
f4e93ef03b |
@ -20,6 +20,9 @@ public record LayoutParsingRequest(
|
|||||||
@NonNull String originFileStorageId,//
|
@NonNull String originFileStorageId,//
|
||||||
@Schema(description = "Optional Path to the table extraction file.")//
|
@Schema(description = "Optional Path to the table extraction file.")//
|
||||||
Optional<String> tablesFileStorageId,//
|
Optional<String> tablesFileStorageId,//
|
||||||
|
|
||||||
|
@Schema(description= "Optional Path to the the table parsing service file")
|
||||||
|
Optional<String> tableExtractorFileId,
|
||||||
@Schema(description = "Optional Path to the image classification file.")//
|
@Schema(description = "Optional Path to the image classification file.")//
|
||||||
Optional<String> imagesFileStorageId,//
|
Optional<String> imagesFileStorageId,//
|
||||||
|
|
||||||
|
|||||||
@ -30,8 +30,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.TableExtractorResponseAdapter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||||
@ -63,6 +66,7 @@ public class LayoutParsingPipeline {
|
|||||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||||
private final CvTableParsingAdapter cvTableParsingAdapter;
|
private final CvTableParsingAdapter cvTableParsingAdapter;
|
||||||
private final LayoutParsingStorageService layoutParsingStorageService;
|
private final LayoutParsingStorageService layoutParsingStorageService;
|
||||||
|
private final TableExtractorResponseAdapter tableExtractorResponseAdapter;
|
||||||
private final SectionsBuilderService sectionsBuilderService;
|
private final SectionsBuilderService sectionsBuilderService;
|
||||||
private final TaasClassificationService taasClassificationService;
|
private final TaasClassificationService taasClassificationService;
|
||||||
private final RedactManagerClassificationService redactManagerClassificationService;
|
private final RedactManagerClassificationService redactManagerClassificationService;
|
||||||
@ -87,12 +91,21 @@ public class LayoutParsingPipeline {
|
|||||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TableExtractorResponse tableExtractorResponse = new TableExtractorResponse();
|
||||||
|
if (layoutParsingRequest.tableExtractorFileId().isPresent()) {
|
||||||
|
tableExtractorResponse = layoutParsingStorageService.getExtractedTableFile(layoutParsingRequest.tableExtractorFileId().get());
|
||||||
|
}
|
||||||
|
|
||||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||||
}
|
}
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
|
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
|
||||||
|
originDocument,
|
||||||
|
imageServiceResponse,
|
||||||
|
tableServiceResponse,
|
||||||
|
tableExtractorResponse);
|
||||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||||
|
|
||||||
int numberOfPages = originDocument.getNumberOfPages();
|
int numberOfPages = originDocument.getNumberOfPages();
|
||||||
@ -100,8 +113,9 @@ public class LayoutParsingPipeline {
|
|||||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
||||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
||||||
|
|
||||||
|
Map<Integer, List<TableExtractorCells>> extractedTableCells = tableExtractorResponseAdapter.buildExtractedTablesPerPage(tableExtractorResponse);
|
||||||
try (var out = new ByteArrayOutputStream()) {
|
try (var out = new ByteArrayOutputStream()) {
|
||||||
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out, false);
|
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out,extractedTableCells ,false);
|
||||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out);
|
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -157,11 +171,12 @@ public class LayoutParsingPipeline {
|
|||||||
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
|
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
|
||||||
PDDocument originDocument,
|
PDDocument originDocument,
|
||||||
ImageServiceResponse imageServiceResponse,
|
ImageServiceResponse imageServiceResponse,
|
||||||
TableServiceResponse tableServiceResponse) {
|
TableServiceResponse tableServiceResponse,
|
||||||
|
TableExtractorResponse tableExtractorResponse) {
|
||||||
|
|
||||||
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||||
|
Map<Integer, List<TableExtractorCells>> extractedTableCells = tableExtractorResponseAdapter.buildExtractedTablesPerPage(tableExtractorResponse);
|
||||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||||
|
|
||||||
@ -244,9 +259,9 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||||
|
|
||||||
if (!classificationPage.isLandscape()) {
|
if (!classificationPage.isLandscape()) {
|
||||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||||
}
|
}
|
||||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||||
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
||||||
|
|||||||
@ -23,6 +23,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Si
|
|||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||||
|
|
||||||
@ -63,6 +64,16 @@ public class LayoutParsingStorageService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public TableExtractorResponse getExtractedTableFile(String storageId) throws IOException {
|
||||||
|
|
||||||
|
try (InputStream inputStream = getObject(storageId)) {
|
||||||
|
TableExtractorResponse tableExtractorResponse = objectMapper.readValue(inputStream, TableExtractorResponse.class);
|
||||||
|
inputStream.close();
|
||||||
|
return tableExtractorResponse;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public TableServiceResponse getTablesFile(String storageId) throws IOException {
|
public TableServiceResponse getTablesFile(String storageId) throws IOException {
|
||||||
|
|
||||||
try (var tableClassificationStream = getObject(storageId)) {
|
try (var tableClassificationStream = getObject(storageId)) {
|
||||||
@ -83,7 +94,6 @@ public class LayoutParsingStorageService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) {
|
public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) {
|
||||||
|
|
||||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
|
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
|
||||||
|
|||||||
@ -14,7 +14,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFre
|
|||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.NonNull;
|
import lombok.NonNull;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
|
|||||||
@ -19,4 +19,5 @@ public class PageContents {
|
|||||||
Rectangle2D cropBox;
|
Rectangle2D cropBox;
|
||||||
Rectangle2D mediaBox;
|
Rectangle2D mediaBox;
|
||||||
List<Ruling> rulings;
|
List<Ruling> rulings;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -108,11 +108,13 @@ public class Boundary implements Comparable<Boundary> {
|
|||||||
return splitBoundaries;
|
return splitBoundaries;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public IntStream intStream() {
|
public IntStream intStream() {
|
||||||
|
|
||||||
return IntStream.range(start, end);
|
return IntStream.range(start, end);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Boundary merge(Collection<Boundary> boundaries) {
|
public static Boundary merge(Collection<Boundary> boundaries) {
|
||||||
|
|
||||||
int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
|
int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
|
||||||
|
|||||||
@ -105,6 +105,7 @@ public class Document implements GenericSemanticNode {
|
|||||||
return streamAllSubNodes().collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
|
return streamAllSubNodes().collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
||||||
|
|||||||
@ -207,6 +207,7 @@ public class Table implements SemanticNode {
|
|||||||
return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col));
|
return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Streams all TableCells row-wise and filters them with header == true.
|
* Streams all TableCells row-wise and filters them with header == true.
|
||||||
*
|
*
|
||||||
|
|||||||
@ -109,10 +109,7 @@ public class AtomicTextBlock implements TextBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData,
|
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, DocumentPositionData documentPositionData, SemanticNode parent, Page page) {
|
||||||
DocumentPositionData documentPositionData,
|
|
||||||
SemanticNode parent,
|
|
||||||
Page page) {
|
|
||||||
|
|
||||||
return AtomicTextBlock.builder()
|
return AtomicTextBlock.builder()
|
||||||
.id(documentTextData.getId())
|
.id(documentTextData.getId())
|
||||||
|
|||||||
@ -1,14 +1,12 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
@ -50,6 +48,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
return getColCount() == 0 || getRowCount() == 0;
|
return getColCount() == 0 || getRowCount() == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<List<Cell>> getRows() {
|
public List<List<Cell>> getRows() {
|
||||||
|
|
||||||
if (rows == null) {
|
if (rows == null) {
|
||||||
@ -276,21 +275,17 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public boolean intersects(Cell cell1, Cell cell2) {
|
public boolean intersects(Cell cell1, Cell cell2) {
|
||||||
|
|
||||||
if (cell1.getHeight() <= 0 || cell2.getHeight() <= 0) {
|
if (cell1.getHeight() <= 0 || cell2.getHeight() <= 0) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
double x0 = cell1.getX() + 2;
|
double x0 = cell1.getX() + 2;
|
||||||
double y0 = cell1.getY() + 2;
|
double y0 = cell1.getY() + 2;
|
||||||
return (cell2.x + cell2.width > x0 &&
|
return (cell2.x + cell2.width > x0 && cell2.y + cell2.height > y0 && cell2.x < x0 + cell1.getWidth() - 2 && cell2.y < y0 + cell1.getHeight() - 2);
|
||||||
cell2.y + cell2.height > y0 &&
|
|
||||||
cell2.x < x0 + cell1.getWidth() -2 &&
|
|
||||||
cell2.y < y0 + cell1.getHeight() -2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getText() {
|
public String getText() {
|
||||||
|
|
||||||
@ -328,8 +323,6 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public String getTextAsHtml() {
|
public String getTextAsHtml() {
|
||||||
|
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
|
|||||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
|||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||||
|
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
|
|||||||
@ -73,7 +73,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
return sequences.get(0).getPageWidth();
|
return sequences.get(0).getPageWidth();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||||
|
|
||||||
@ -82,6 +82,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
return fromTextPositionSequences(sequences);
|
return fromTextPositionSequences(sequences);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
|
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
|
||||||
|
|
||||||
TextPageBlock textBlock = null;
|
TextPageBlock textBlock = null;
|
||||||
@ -133,7 +134,6 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the minX value in pdf coordinate system.
|
* Returns the minX value in pdf coordinate system.
|
||||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||||
|
|||||||
@ -234,6 +234,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
@JsonAttribute(ignore = true)
|
@JsonAttribute(ignore = true)
|
||||||
public String getFontStyle() {
|
public String getFontStyle() {
|
||||||
|
|
||||||
if (textPositions.get(0).getFontName() == null) {
|
if (textPositions.get(0).getFontName() == null) {
|
||||||
return "standard";
|
return "standard";
|
||||||
}
|
}
|
||||||
|
|||||||
@ -9,10 +9,10 @@ import java.util.Map;
|
|||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
@ -20,8 +20,7 @@ import lombok.RequiredArgsConstructor;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class ImageServiceResponseAdapter {
|
public class ImageServiceResponseAdapter {
|
||||||
|
|
||||||
|
public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse) {
|
||||||
public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse ) {
|
|
||||||
|
|
||||||
Map<Integer, List<ClassifiedImage>> images = new HashMap<>();
|
Map<Integer, List<ClassifiedImage>> images = new HashMap<>();
|
||||||
imageServiceResponse.getData().forEach(imageMetadata -> {
|
imageServiceResponse.getData().forEach(imageMetadata -> {
|
||||||
|
|||||||
@ -0,0 +1,74 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.python_api.adapter;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTable;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTableData;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableData;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
|
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@Slf4j
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class TableExtractorResponseAdapter {
|
||||||
|
|
||||||
|
public Map<Integer, List<TableExtractorCells>> buildExtractedTablesPerPage(TableExtractorResponse tableExtractorResponse) {
|
||||||
|
Map<Integer, List<TableExtractorCells>> tableCells = new HashMap<>();
|
||||||
|
tableExtractorResponse.getData()
|
||||||
|
.forEach(tableData -> tableCells.computeIfAbsent(tableData.getPage_number(), tableCell -> new ArrayList<>())
|
||||||
|
.addAll(convertTableCells(tableData.getTables())));
|
||||||
|
|
||||||
|
return tableCells;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<TableExtractorCells> convertTableCells(List<ExtractedTable> tableObjects) {
|
||||||
|
|
||||||
|
List<TableExtractorCells> parsedTableCells = new ArrayList<>();
|
||||||
|
|
||||||
|
tableObjects.stream().forEach(t -> {
|
||||||
|
TableExtractorCells tableCells = new TableExtractorCells();
|
||||||
|
tableCells.setX0(t.getTable().getBbox().get(0));
|
||||||
|
tableCells.setX1(t.getTable().getBbox().get(2));
|
||||||
|
tableCells.setY0(t.getTable().getBbox().get(1));
|
||||||
|
tableCells.setY1(t.getTable().getBbox().get(3));
|
||||||
|
tableCells.setWidth(tableCells.getX1()- tableCells.getX0());
|
||||||
|
tableCells.setHeight(tableCells.getY1()- tableCells.getY0());
|
||||||
|
tableCells.setLabel(t.getTable().getLabel());
|
||||||
|
log.info("Parsed table cell {} with label {}",tableCells, tableCells.getLabel());
|
||||||
|
parsedTableCells.add(tableCells);
|
||||||
|
t.getObjects().forEach(o -> {
|
||||||
|
TableExtractorCells objectCell = new TableExtractorCells();
|
||||||
|
objectCell.setX0(t.getTable().getBbox().get(0));
|
||||||
|
objectCell.setX1(t.getTable().getBbox().get(2));
|
||||||
|
objectCell.setY0(t.getTable().getBbox().get(1));
|
||||||
|
objectCell.setY1(t.getTable().getBbox().get(3));
|
||||||
|
objectCell.setWidth(objectCell.getX1()- objectCell.getX0());
|
||||||
|
objectCell.setHeight(objectCell.getY1()- objectCell.getY0());
|
||||||
|
objectCell.setLabel(o.getLabel());
|
||||||
|
log.info("Parsed object cell {} with label {}",objectCell, objectCell.getLabel());
|
||||||
|
parsedTableCells.add(objectCell);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
return parsedTableCells;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,20 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class ExtractedTable {
|
||||||
|
|
||||||
|
private boolean rotated;
|
||||||
|
private ExtractedTableData table;
|
||||||
|
private List<ExtractedTableData> objects;
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,20 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class ExtractedTableData {
|
||||||
|
|
||||||
|
private String label;
|
||||||
|
private float score;
|
||||||
|
private List<Float> bbox;
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,21 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class TableExtractorCells {
|
||||||
|
private float x0;
|
||||||
|
private float y0;
|
||||||
|
private float x1;
|
||||||
|
private float y1;
|
||||||
|
private float width;
|
||||||
|
private float height;
|
||||||
|
private String label;
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,19 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class TableExtractorData {
|
||||||
|
|
||||||
|
private int page_number;
|
||||||
|
private int image;
|
||||||
|
private List<ExtractedTable> tables;
|
||||||
|
}
|
||||||
@ -0,0 +1,23 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Builder
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class TableExtractorResponse {
|
||||||
|
|
||||||
|
private String dossierId;
|
||||||
|
private String fileId;
|
||||||
|
private String targetFileExtension;
|
||||||
|
private String responseFileExtension;
|
||||||
|
private String X_TENANT_ID;
|
||||||
|
private List<TableExtractorData> data;
|
||||||
|
|
||||||
|
}
|
||||||
@ -25,6 +25,7 @@ public class BodyTextFrameService {
|
|||||||
private static final float RULING_HEIGHT_THRESHOLD = 0.15f; // multiplied with page height. Header/Footer Rulings must be within that border of the page.
|
private static final float RULING_HEIGHT_THRESHOLD = 0.15f; // multiplied with page height. Header/Footer Rulings must be within that border of the page.
|
||||||
private static final float RULING_WIDTH_THRESHOLD = 0.75f; // multiplied with page width. Header/Footer Rulings must be at least that wide.
|
private static final float RULING_WIDTH_THRESHOLD = 0.75f; // multiplied with page width. Header/Footer Rulings must be at least that wide.
|
||||||
|
|
||||||
|
|
||||||
public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {
|
public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {
|
||||||
|
|
||||||
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
|
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
|
||||||
@ -155,8 +156,9 @@ public class BodyTextFrameService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || MarkedContentUtils.intersects(textBlock,
|
||||||
|| MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)) {
|
page.getMarkedContentBboxPerType(),
|
||||||
|
MarkedContentUtils.FOOTER)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -22,7 +22,6 @@ public class DividingColumnDetectionService {
|
|||||||
|
|
||||||
public List<Rectangle2D> detectColumns(PageContents pageContents) {
|
public List<Rectangle2D> detectColumns(PageContents pageContents) {
|
||||||
|
|
||||||
|
|
||||||
if (pageContents.getSortedTextPositionSequences().size() < 2) {
|
if (pageContents.getSortedTextPositionSequences().size() < 2) {
|
||||||
return List.of(pageContents.getCropBox());
|
return List.of(pageContents.getCropBox());
|
||||||
}
|
}
|
||||||
|
|||||||
@ -72,11 +72,13 @@ public class GapDetectionService {
|
|||||||
return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle()));
|
return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static Rectangle2D mirrorY(Rectangle2D rectangle2D) {
|
private static Rectangle2D mirrorY(Rectangle2D rectangle2D) {
|
||||||
|
|
||||||
return new Rectangle2D.Double(rectangle2D.getX(), Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()), rectangle2D.getWidth(), Math.abs(rectangle2D.getHeight()));
|
return new Rectangle2D.Double(rectangle2D.getX(), Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()), rectangle2D.getWidth(), Math.abs(rectangle2D.getHeight()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
|
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
|
||||||
|
|
||||||
context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(),
|
context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(),
|
||||||
|
|||||||
@ -6,7 +6,6 @@ import java.util.LinkedList;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Queue;
|
import java.util.Queue;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||||
|
|
||||||
@ -51,7 +50,9 @@ public class GapsAcrossLinesService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return columnFactory.outputGaps.stream()
|
return columnFactory.outputGaps.stream()
|
||||||
.filter(gapAcrossLines -> columnFactory.outputGaps.stream().filter(gapAcrossLines::intersectsX).noneMatch(gapAcrossLines1 -> gapAcrossLines1.lineCount > gapAcrossLines.lineCount))
|
.filter(gapAcrossLines -> columnFactory.outputGaps.stream()
|
||||||
|
.filter(gapAcrossLines::intersectsX)
|
||||||
|
.noneMatch(gapAcrossLines1 -> gapAcrossLines1.lineCount > gapAcrossLines.lineCount))
|
||||||
.filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMinX() - mainBodyTextFrame.getMinX()) > DISTANCE_TO_BORDER_THRESHOLD)
|
.filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMinX() - mainBodyTextFrame.getMinX()) > DISTANCE_TO_BORDER_THRESHOLD)
|
||||||
.filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMaxX() - mainBodyTextFrame.getMaxX()) > DISTANCE_TO_BORDER_THRESHOLD)
|
.filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMaxX() - mainBodyTextFrame.getMaxX()) > DISTANCE_TO_BORDER_THRESHOLD)
|
||||||
.map(GapAcrossLines::getRectangle2D)
|
.map(GapAcrossLines::getRectangle2D)
|
||||||
|
|||||||
@ -6,8 +6,8 @@ import java.util.List;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
|
|||||||
@ -16,8 +16,7 @@ public class MainBodyTextFrameExtractionService {
|
|||||||
|
|
||||||
public Rectangle2D calculateMainBodyTextFrame(LineInformation lineInformation) {
|
public Rectangle2D calculateMainBodyTextFrame(LineInformation lineInformation) {
|
||||||
|
|
||||||
Rectangle2D mainBodyTextFrame = lineInformation.getLineBBox().stream()
|
Rectangle2D mainBodyTextFrame = lineInformation.getLineBBox().stream().collect(RectangleTransformations.collectBBox());
|
||||||
.collect(RectangleTransformations.collectBBox());
|
|
||||||
|
|
||||||
return RectangleTransformations.pad(mainBodyTextFrame, mainBodyTextFrame.getWidth() * TEXT_FRAME_PAD_WIDTH, mainBodyTextFrame.getHeight() * TEXT_FRAME_PAD_HEIGHT);
|
return RectangleTransformations.pad(mainBodyTextFrame, mainBodyTextFrame.getWidth() * TEXT_FRAME_PAD_WIDTH, mainBodyTextFrame.getHeight() * TEXT_FRAME_PAD_HEIGHT);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -52,7 +52,7 @@ public class PageContentExtractor {
|
|||||||
stripper.getRulings()));
|
stripper.getRulings()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return textPositionSequencesPerPage;
|
return textPositionSequencesPerPage;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -5,9 +5,9 @@ import java.util.List;
|
|||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
|
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
public class SimplifiedSectionTextService {
|
public class SimplifiedSectionTextService {
|
||||||
@ -23,4 +23,5 @@ public class SimplifiedSectionTextService {
|
|||||||
|
|
||||||
return SimplifiedSectionText.builder().sectionNumber(section.getTreeId().get(0)).text(section.getTextBlock().getSearchText()).build();
|
return SimplifiedSectionText.builder().sectionNumber(section.getTreeId().get(0)).text(section.getTextBlock().getSearchText()).build();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,9 +1,20 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||||
|
|
||||||
|
|
||||||
// TODO: figure out why this fails the build
|
// TODO: figure out why this fails the build
|
||||||
// import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING;
|
// import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||||
@ -11,12 +22,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.regex.Matcher;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
@SuppressWarnings("all")
|
@SuppressWarnings("all")
|
||||||
@ -83,13 +88,13 @@ public class TaasBlockificationService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText());
|
Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText());
|
||||||
boolean isListIdentifier = listIdentifierPattern.find();
|
boolean isListIdentifier = listIdentifierPattern.find();
|
||||||
|
|
||||||
boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||||
|
|
||||||
boolean sameFont = previousTextBlock.getMostPopularWordFont().equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize();
|
boolean sameFont = previousTextBlock.getMostPopularWordFont()
|
||||||
|
.equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize();
|
||||||
// boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
// boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||||
|
|
||||||
boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD;
|
boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD;
|
||||||
@ -119,8 +124,9 @@ public class TaasBlockificationService {
|
|||||||
}
|
}
|
||||||
alreadyMerged.add(textPageBlock);
|
alreadyMerged.add(textPageBlock);
|
||||||
textBlocksToMerge.add(Stream.concat(Stream.of(textPageBlock),
|
textBlocksToMerge.add(Stream.concat(Stream.of(textPageBlock),
|
||||||
textPageBlocks.stream().filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2)).peek(alreadyMerged::add))
|
textPageBlocks.stream()
|
||||||
.toList());
|
.filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2))
|
||||||
|
.peek(alreadyMerged::add)).toList());
|
||||||
}
|
}
|
||||||
return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
|
return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
|
||||||
}
|
}
|
||||||
@ -163,8 +169,7 @@ public class TaasBlockificationService {
|
|||||||
while (itty.hasNext()) {
|
while (itty.hasNext()) {
|
||||||
TextPageBlock block = (TextPageBlock) itty.next();
|
TextPageBlock block = (TextPageBlock) itty.next();
|
||||||
|
|
||||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(
|
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
||||||
block.getMaxY(),
|
|
||||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
||||||
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||||
previous.add(block);
|
previous.add(block);
|
||||||
@ -189,7 +194,6 @@ public class TaasBlockificationService {
|
|||||||
TextPositionSequence prev = null;
|
TextPositionSequence prev = null;
|
||||||
// TODO: make static final constant
|
// TODO: make static final constant
|
||||||
|
|
||||||
|
|
||||||
boolean wasSplitted = false;
|
boolean wasSplitted = false;
|
||||||
Float splitX1 = null;
|
Float splitX1 = null;
|
||||||
for (TextPositionSequence word : textPositions) {
|
for (TextPositionSequence word : textPositions) {
|
||||||
|
|||||||
@ -5,7 +5,6 @@ import java.util.Locale;
|
|||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
@ -13,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -63,16 +63,16 @@ public class DocuMineClassificationService {
|
|||||||
textBlock.setClassification(PageBlockType.OTHER);
|
textBlock.setClassification(PageBlockType.OTHER);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
||||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
textBlock,
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||||
) {
|
.getMostPopular())) {
|
||||||
textBlock.setClassification(PageBlockType.HEADER);
|
textBlock.setClassification(PageBlockType.HEADER);
|
||||||
|
|
||||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
||||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
textBlock,
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||||
) {
|
.getMostPopular())) {
|
||||||
textBlock.setClassification(PageBlockType.FOOTER);
|
textBlock.setClassification(PageBlockType.FOOTER);
|
||||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||||
|
|||||||
@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
@ -11,6 +10,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -21,7 +21,6 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class RedactManagerClassificationService {
|
public class RedactManagerClassificationService {
|
||||||
|
|
||||||
|
|
||||||
public void classifyDocument(ClassificationDocument document) {
|
public void classifyDocument(ClassificationDocument document) {
|
||||||
|
|
||||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||||
@ -52,14 +51,16 @@ public class RedactManagerClassificationService {
|
|||||||
textBlock.setClassification(PageBlockType.OTHER);
|
textBlock.setClassification(PageBlockType.OTHER);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
||||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
textBlock,
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||||
|
.getMostPopular())) {
|
||||||
textBlock.setClassification(PageBlockType.HEADER);
|
textBlock.setClassification(PageBlockType.HEADER);
|
||||||
|
|
||||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
||||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
textBlock,
|
||||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||||
|
.getMostPopular())) {
|
||||||
textBlock.setClassification(PageBlockType.FOOTER);
|
textBlock.setClassification(PageBlockType.FOOTER);
|
||||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||||
|
|||||||
@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
@ -12,6 +11,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -27,7 +27,6 @@ public class TaasClassificationService {
|
|||||||
|
|
||||||
public void classifyDocument(ClassificationDocument document) {
|
public void classifyDocument(ClassificationDocument document) {
|
||||||
|
|
||||||
|
|
||||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||||
|
|
||||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||||
@ -57,11 +56,13 @@ public class TaasClassificationService {
|
|||||||
textBlock.setClassification(PageBlockType.OTHER);
|
textBlock.setClassification(PageBlockType.OTHER);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
||||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
|
textBlock,
|
||||||
|
page.getRotation())) {
|
||||||
textBlock.setClassification(PageBlockType.HEADER);
|
textBlock.setClassification(PageBlockType.HEADER);
|
||||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
||||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
|
textBlock,
|
||||||
|
page.getRotation())) {
|
||||||
textBlock.setClassification(PageBlockType.FOOTER);
|
textBlock.setClassification(PageBlockType.FOOTER);
|
||||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||||
|
|||||||
@ -18,8 +18,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
|
||||||
@ -31,6 +29,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||||
|
|
||||||
|
|||||||
@ -8,10 +8,10 @@ import java.util.List;
|
|||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
@ -110,6 +110,7 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE;
|
return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static List<Boundary> mergeToBoundaries(List<Integer> integers) {
|
private static List<Boundary> mergeToBoundaries(List<Integer> integers) {
|
||||||
|
|
||||||
if (integers.isEmpty()) {
|
if (integers.isEmpty()) {
|
||||||
@ -125,8 +126,9 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
}
|
}
|
||||||
end = current + 1;
|
end = current + 1;
|
||||||
}
|
}
|
||||||
if (boundaries.isEmpty())
|
if (boundaries.isEmpty()) {
|
||||||
boundaries.add(new Boundary(start, end));
|
boundaries.add(new Boundary(start, end));
|
||||||
|
}
|
||||||
return boundaries;
|
return boundaries;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -138,6 +140,7 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
|
private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
|
||||||
|
|
||||||
return Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition);
|
return Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition);
|
||||||
@ -163,17 +166,7 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
|
|
||||||
private boolean isHyphen(String unicodeCharacter) {
|
private boolean isHyphen(String unicodeCharacter) {
|
||||||
|
|
||||||
return Objects.equals(unicodeCharacter, "-") || //
|
return false;
|
||||||
Objects.equals(unicodeCharacter, "~") || //
|
|
||||||
Objects.equals(unicodeCharacter, "‐") || //
|
|
||||||
Objects.equals(unicodeCharacter, "‒") || //
|
|
||||||
Objects.equals(unicodeCharacter, "⁻") || //
|
|
||||||
Objects.equals(unicodeCharacter, "−") || //
|
|
||||||
Objects.equals(unicodeCharacter, "﹣") || //
|
|
||||||
Objects.equals(unicodeCharacter, "゠") || //
|
|
||||||
Objects.equals(unicodeCharacter, "⁓") || //
|
|
||||||
Objects.equals(unicodeCharacter, "‑") || //
|
|
||||||
Objects.equals(unicodeCharacter, "\u00AD");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -11,12 +11,12 @@ import java.util.Map;
|
|||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
|||||||
@ -8,15 +8,15 @@ import java.util.Set;
|
|||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass;
|
import lombok.experimental.UtilityClass;
|
||||||
|
|||||||
@ -2,10 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.factory;
|
|||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.experimental.FieldDefaults;
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|||||||
@ -7,11 +7,11 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.NoSuchElementException;
|
import java.util.NoSuchElementException;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
|
package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
@ -9,7 +8,6 @@ import java.util.Map;
|
|||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||||
|
|
||||||
|
|||||||
@ -329,6 +329,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
|||||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
|
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String getText(PDDocument doc) throws IOException {
|
public String getText(PDDocument doc) throws IOException {
|
||||||
|
|
||||||
|
|||||||
@ -25,10 +25,23 @@ import java.io.StringWriter;
|
|||||||
import java.io.Writer;
|
import java.io.Writer;
|
||||||
import java.text.Bidi;
|
import java.text.Bidi;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.*;
|
import java.util.ArrayDeque;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.Deque;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.SortedMap;
|
||||||
|
import java.util.SortedSet;
|
||||||
|
import java.util.StringTokenizer;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
import java.util.TreeSet;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import lombok.Getter;
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.pdfbox.cos.COSDictionary;
|
import org.apache.pdfbox.cos.COSDictionary;
|
||||||
@ -46,6 +59,8 @@ import org.apache.pdfbox.text.TextPositionComparator;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is just a copy except i only adjusted lines 594-607 cause this is a bug in Pdfbox.
|
* This is just a copy except i only adjusted lines 594-607 cause this is a bug in Pdfbox.
|
||||||
* see S416.pdf
|
* see S416.pdf
|
||||||
@ -194,40 +209,33 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void beginMarkedContentSequence(COSName tag, COSDictionary properties) {
|
||||||
|
|
||||||
public void beginMarkedContentSequence(COSName tag, COSDictionary properties)
|
|
||||||
{
|
|
||||||
PDMarkedContent markedContent = PDMarkedContent.create(tag, properties);
|
PDMarkedContent markedContent = PDMarkedContent.create(tag, properties);
|
||||||
if (this.currentMarkedContents.isEmpty())
|
if (this.currentMarkedContents.isEmpty()) {
|
||||||
{
|
|
||||||
this.markedContents.add(markedContent);
|
this.markedContents.add(markedContent);
|
||||||
}
|
} else {
|
||||||
else
|
PDMarkedContent currentMarkedContent = this.currentMarkedContents.peek();
|
||||||
{
|
if (currentMarkedContent != null) {
|
||||||
PDMarkedContent currentMarkedContent =
|
|
||||||
this.currentMarkedContents.peek();
|
|
||||||
if (currentMarkedContent != null)
|
|
||||||
{
|
|
||||||
currentMarkedContent.addMarkedContent(markedContent);
|
currentMarkedContent.addMarkedContent(markedContent);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
this.currentMarkedContents.push(markedContent);
|
this.currentMarkedContents.push(markedContent);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void endMarkedContentSequence()
|
public void endMarkedContentSequence() {
|
||||||
{
|
|
||||||
if (!this.currentMarkedContents.isEmpty())
|
if (!this.currentMarkedContents.isEmpty()) {
|
||||||
{
|
|
||||||
this.currentMarkedContents.pop();
|
this.currentMarkedContents.pop();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void xobject(PDXObject xobject)
|
public void xobject(PDXObject xobject) {
|
||||||
{
|
|
||||||
if (!this.currentMarkedContents.isEmpty())
|
if (!this.currentMarkedContents.isEmpty()) {
|
||||||
{
|
|
||||||
this.currentMarkedContents.peek().addXObject(xobject);
|
this.currentMarkedContents.peek().addXObject(xobject);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -635,7 +643,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
var normalized = normalize(line);
|
var normalized = normalize(line);
|
||||||
// normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent()
|
// normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent()
|
||||||
|
|
||||||
|
|
||||||
lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
|
lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
|
||||||
writeLine(normalized, current.isParagraphStart);
|
writeLine(normalized, current.isParagraphStart);
|
||||||
line.clear();
|
line.clear();
|
||||||
@ -914,8 +921,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
textList.add(text);
|
textList.add(text);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!this.currentMarkedContents.isEmpty())
|
if (!this.currentMarkedContents.isEmpty()) {
|
||||||
{
|
|
||||||
this.currentMarkedContents.peek().addText(text);
|
this.currentMarkedContents.peek().addText(text);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2102,7 +2108,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
return endParagraphWritten;
|
return endParagraphWritten;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setEndParagraphWritten(){
|
|
||||||
|
public void setEndParagraphWritten() {
|
||||||
|
|
||||||
endParagraphWritten = true;
|
endParagraphWritten = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2145,7 +2153,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
this.isHangingIndent = true;
|
this.isHangingIndent = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,10 +1,13 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
||||||
|
|
||||||
|
import java.awt.Color;
|
||||||
import java.awt.geom.AffineTransform;
|
import java.awt.geom.AffineTransform;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.pdfbox.cos.COSDictionary;
|
import org.apache.pdfbox.cos.COSDictionary;
|
||||||
@ -30,6 +33,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.visualization.
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid;
|
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText;
|
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
@ -40,7 +44,6 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class ViewerDocumentService {
|
public class ViewerDocumentService {
|
||||||
|
|
||||||
|
|
||||||
private static final String LAYER_NAME = "Layout grid";
|
private static final String LAYER_NAME = "Layout grid";
|
||||||
private static final int FONT_SIZE = 10;
|
private static final int FONT_SIZE = 10;
|
||||||
public static final float LINE_WIDTH = 1f;
|
public static final float LINE_WIDTH = 1f;
|
||||||
@ -49,13 +52,18 @@ public class ViewerDocumentService {
|
|||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) {
|
public void createViewerDocument(PDDocument pdDocument,
|
||||||
|
Document document,
|
||||||
|
OutputStream outputStream,
|
||||||
|
Map<Integer,List<TableExtractorCells>> extractedTableCells,
|
||||||
|
boolean layerVisibilityDefaultValue) {
|
||||||
|
|
||||||
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
|
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
|
||||||
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
|
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
|
||||||
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
|
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
|
||||||
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
|
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
|
||||||
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue);
|
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue);
|
||||||
|
PDOptionalContentGroup visualLayoutParsingLayer = addLayerToDocument(pdDocument, dictionariesToUpdate, true);
|
||||||
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
|
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
|
||||||
|
|
||||||
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
|
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
|
||||||
@ -72,44 +80,25 @@ public class ViewerDocumentService {
|
|||||||
assert pageNumber == visualizationsOnPage.getPageNumber();
|
assert pageNumber == visualizationsOnPage.getPageNumber();
|
||||||
// We need to append to the content stream, otherwise the content could be overlapped by following content.
|
// We need to append to the content stream, otherwise the content could be overlapped by following content.
|
||||||
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
|
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
|
||||||
|
contentStream.beginMarkedContent(COSName.OC, visualLayoutParsingLayer);
|
||||||
contentStream.beginMarkedContent(COSName.OC, layer);
|
|
||||||
contentStream.saveGraphicsState();
|
contentStream.saveGraphicsState();
|
||||||
|
|
||||||
contentStream.setLineWidth(LINE_WIDTH);
|
contentStream.setLineWidth(LINE_WIDTH);
|
||||||
for (ColoredLine coloredLine : visualizationsOnPage.getColoredLines()) {
|
for (TableExtractorCells tableCells : extractedTableCells.get(pageNumber)) {
|
||||||
contentStream.setStrokingColor(coloredLine.color());
|
contentStream.setStrokingColor(new Color(0xB700FF));
|
||||||
contentStream.moveTo((float) coloredLine.line().getX1(), (float) coloredLine.line().getY1());
|
contentStream.addRect((float) tableCells.getX0(), (float) tableCells.getY0(), (float) tableCells.getWidth(), (float) tableCells.getHeight());
|
||||||
contentStream.lineTo((float) coloredLine.line().getX2(), (float) coloredLine.line().getY2());
|
|
||||||
contentStream.stroke();
|
contentStream.stroke();
|
||||||
}
|
|
||||||
for (ColoredRectangle coloredRectangle : visualizationsOnPage.getColoredRectangles()) {
|
|
||||||
contentStream.setStrokingColor(coloredRectangle.color());
|
|
||||||
Rectangle2D r = coloredRectangle.rectangle2D();
|
|
||||||
contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
|
|
||||||
contentStream.stroke();
|
|
||||||
}
|
|
||||||
for (FilledRectangle filledRectangle : visualizationsOnPage.getFilledRectangles()) {
|
|
||||||
contentStream.setNonStrokingColor(filledRectangle.color());
|
|
||||||
PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState();
|
|
||||||
graphicsState.setNonStrokingAlphaConstant(filledRectangle.alpha());
|
|
||||||
contentStream.setGraphicsStateParameters(graphicsState);
|
|
||||||
Rectangle2D r = filledRectangle.rectangle2D();
|
|
||||||
contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
|
|
||||||
contentStream.fill();
|
|
||||||
}
|
|
||||||
for (PlacedText placedText : visualizationsOnPage.getPlacedTexts()) {
|
|
||||||
contentStream.setFont(font, FONT_SIZE);
|
contentStream.setFont(font, FONT_SIZE);
|
||||||
contentStream.beginText();
|
contentStream.beginText();
|
||||||
Matrix textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
|
Matrix textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
|
||||||
(float) textDeRotationMatrix.getShearX(),
|
(float) textDeRotationMatrix.getShearX(),
|
||||||
(float) textDeRotationMatrix.getShearY(),
|
(float) textDeRotationMatrix.getShearY(),
|
||||||
(float) textDeRotationMatrix.getScaleY(),
|
(float) textDeRotationMatrix.getScaleY(),
|
||||||
(float) placedText.lineStart().getX(),
|
tableCells.getX0() ,
|
||||||
(float) placedText.lineStart().getY());
|
tableCells.getY0());
|
||||||
textMatrix.translate(-((font.getStringWidth(placedText.text()) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), -FONT_SIZE);
|
textMatrix.translate(-((font.getStringWidth(tableCells.getLabel()) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), -FONT_SIZE);
|
||||||
contentStream.setTextMatrix(textMatrix);
|
contentStream.setTextMatrix(textMatrix);
|
||||||
contentStream.showText(placedText.text());
|
contentStream.showText(tableCells.getLabel());
|
||||||
contentStream.endText();
|
contentStream.endText();
|
||||||
}
|
}
|
||||||
contentStream.restoreGraphicsState();
|
contentStream.restoreGraphicsState();
|
||||||
|
|||||||
@ -1,12 +1,5 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
|
||||||
import lombok.experimental.UtilityClass;
|
|
||||||
import org.apache.pdfbox.cos.COSName;
|
|
||||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
@ -14,12 +7,22 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.cos.COSName;
|
||||||
|
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||||
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
@UtilityClass
|
@UtilityClass
|
||||||
public class MarkedContentUtils {
|
public class MarkedContentUtils {
|
||||||
|
|
||||||
public static final String HEADER = "Header";
|
public static final String HEADER = "Header";
|
||||||
public static final String FOOTER = "Footer";
|
public static final String FOOTER = "Footer";
|
||||||
|
|
||||||
|
|
||||||
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
|
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
|
||||||
|
|
||||||
if (markedContents == null) {
|
if (markedContents == null) {
|
||||||
@ -31,7 +34,8 @@ public class MarkedContentUtils {
|
|||||||
.filter(m -> m.getProperties() != null)
|
.filter(m -> m.getProperties() != null)
|
||||||
.filter(m -> m.getProperties().getItem("Subtype") != null)
|
.filter(m -> m.getProperties().getItem("Subtype") != null)
|
||||||
.filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype))
|
.filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype))
|
||||||
.map(PDMarkedContent::getContents).flatMap(Collection::stream)
|
.map(PDMarkedContent::getContents)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
.filter(t -> t instanceof TextPosition)
|
.filter(t -> t instanceof TextPosition)
|
||||||
.map(t -> (TextPosition) t)
|
.map(t -> (TextPosition) t)
|
||||||
.filter(t -> !t.getUnicode().equals(" "))
|
.filter(t -> !t.getUnicode().equals(" "))
|
||||||
@ -41,16 +45,19 @@ public class MarkedContentUtils {
|
|||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
return markedContentByYPosition.values().stream()
|
return markedContentByYPosition.values()
|
||||||
.map(textPositions -> new TextPositionSequence(textPositions.stream()
|
.stream()
|
||||||
.toList(), 0, true)
|
.map(textPositions -> new TextPositionSequence(textPositions.stream().toList(), 0, true).getRectangle())
|
||||||
.getRectangle())
|
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
|
||||||
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean intersects(TextPageBlock textBlock, Map<String, List<Rectangle2D>> markedContentBboxPerType, String type) {
|
public boolean intersects(TextPageBlock textBlock, Map<String, List<Rectangle2D>> markedContentBboxPerType, String type) {
|
||||||
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).stream().anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
|
|
||||||
|
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type)
|
||||||
|
.stream()
|
||||||
|
.anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -19,10 +19,9 @@ public final class PositionUtils {
|
|||||||
|
|
||||||
double threshold = textBlock.getMostPopularWordHeight() * 3;
|
double threshold = textBlock.getMostPopularWordHeight() * 3;
|
||||||
|
|
||||||
if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX()
|
if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX() && textBlock.getPdfMaxX() - threshold < btf.getTopLeft()
|
||||||
&& textBlock.getPdfMaxX() - threshold < btf.getTopLeft().getX() + btf.getWidth()
|
.getX() + btf.getWidth() && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY() && textBlock.getPdfMaxY() - threshold < btf.getTopLeft()
|
||||||
&& textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY()
|
.getY() + btf.getHeight()) {
|
||||||
&& textBlock.getPdfMaxY() - threshold < btf.getTopLeft().getY() + btf.getHeight()) {
|
|
||||||
return true;
|
return true;
|
||||||
} else {
|
} else {
|
||||||
return false;
|
return false;
|
||||||
|
|||||||
@ -41,11 +41,14 @@ public class RectangleTransformations {
|
|||||||
|
|
||||||
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
|
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> collectBBox() {
|
public static Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> collectBBox() {
|
||||||
|
|
||||||
return new Rectangle2DBBoxCollector();
|
return new Rectangle2DBBoxCollector();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static PDRectangle toPDRectangleBBox(List<Rectangle> rectangles) {
|
public static PDRectangle toPDRectangleBBox(List<Rectangle> rectangles) {
|
||||||
|
|
||||||
Rectangle2D rectangle2D = RectangleTransformations.rectangleBBox(rectangles);
|
Rectangle2D rectangle2D = RectangleTransformations.rectangleBBox(rectangles);
|
||||||
@ -70,6 +73,7 @@ public class RectangleTransformations {
|
|||||||
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
|
public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
|
||||||
|
|
||||||
return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector());
|
return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector());
|
||||||
@ -84,6 +88,7 @@ public class RectangleTransformations {
|
|||||||
-redactionLogRectangle.getHeight());
|
-redactionLogRectangle.getHeight());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Rectangle2D toRectangle2D(PDRectangle rectangle) {
|
public static Rectangle2D toRectangle2D(PDRectangle rectangle) {
|
||||||
|
|
||||||
return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight());
|
return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight());
|
||||||
|
|||||||
@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
|
||||||
|
|||||||
@ -28,15 +28,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPosit
|
|||||||
*
|
*
|
||||||
* @author Ben Litchfield
|
* @author Ben Litchfield
|
||||||
*/
|
*/
|
||||||
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence>
|
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> {
|
||||||
{
|
|
||||||
@Override
|
@Override
|
||||||
public int compare(TextPositionSequence pos1, TextPositionSequence pos2)
|
public int compare(TextPositionSequence pos1, TextPositionSequence pos2) {
|
||||||
{
|
|
||||||
// only compare text that is in the same direction
|
// only compare text that is in the same direction
|
||||||
int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
|
int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
|
||||||
if (cmp1 != 0)
|
if (cmp1 != 0) {
|
||||||
{
|
|
||||||
return cmp1;
|
return cmp1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -54,19 +52,13 @@ public class TextPositionSequenceComparator implements Comparator<TextPositionSe
|
|||||||
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
||||||
|
|
||||||
// we will do a simple tolerance comparison
|
// we will do a simple tolerance comparison
|
||||||
if (yDifference < .1 ||
|
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
|
||||||
pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom ||
|
|
||||||
pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)
|
|
||||||
{
|
|
||||||
return Float.compare(x1, x2);
|
return Float.compare(x1, x2);
|
||||||
}
|
} else if (pos1YBottom < pos2YBottom) {
|
||||||
else if (pos1YBottom < pos2YBottom)
|
|
||||||
{
|
|
||||||
return -1;
|
return -1;
|
||||||
}
|
} else {
|
||||||
else
|
|
||||||
{
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -14,7 +14,7 @@ import com.knecon.fforesight.service.layoutparser.server.queue.MessagingConfigur
|
|||||||
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
|
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
|
||||||
|
|
||||||
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class})
|
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class})
|
||||||
@Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class, MessagingConfiguration.class})
|
@Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class, MessagingConfiguration.class})
|
||||||
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
|
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
|
||||||
public class Application {
|
public class Application {
|
||||||
|
|
||||||
|
|||||||
@ -26,6 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipelin
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||||
@ -50,7 +51,7 @@ public class BdrJsonBuildTest extends AbstractTest {
|
|||||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS,
|
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS,
|
||||||
pdDocument,
|
pdDocument,
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse()));
|
new TableServiceResponse(), new TableExtractorResponse()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -33,6 +33,7 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipelin
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
||||||
@ -98,7 +99,7 @@ public class HeadlinesGoldStandardIntegrationTest {
|
|||||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
Loader.loadPDF(pdfFileResource.getFile()),
|
Loader.loadPDF(pdfFileResource.getFile()),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse()));
|
new TableServiceResponse(), new TableExtractorResponse()));
|
||||||
|
|
||||||
var foundHeadlines = documentGraph.streamAllSubNodes()
|
var foundHeadlines = documentGraph.streamAllSubNodes()
|
||||||
.map(SemanticNode::getHeadline)
|
.map(SemanticNode::getHeadline)
|
||||||
|
|||||||
@ -16,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Do
|
|||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||||
@ -58,7 +59,7 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
|
|||||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
Loader.loadPDF(filename.toFile()),
|
Loader.loadPDF(filename.toFile()),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse()));
|
new TableServiceResponse(), new TableExtractorResponse()));
|
||||||
|
|
||||||
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
|
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
|
||||||
ObjectMapper mapper = ObjectMapperFactory.create();
|
ObjectMapper mapper = ObjectMapperFactory.create();
|
||||||
|
|||||||
@ -1,27 +1,53 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw.drawRectangle2DList;
|
||||||
|
|
||||||
|
import java.awt.Color;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.awt.geom.RectangularShape;
|
||||||
|
import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
import org.apache.pdfbox.Loader;
|
import org.apache.pdfbox.Loader;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.json.JSONArray;
|
||||||
|
import org.json.JSONObject;
|
||||||
import org.junit.jupiter.api.Disabled;
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
import org.springframework.core.io.ClassPathResource;
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.JsonNode;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTable;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.LineDetectionService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
@ -29,7 +55,9 @@ import com.knecon.fforesight.service.layoutparser.processor.services.mapper.Docu
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
@ -41,6 +69,9 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
@Autowired
|
@Autowired
|
||||||
private RedactManagerClassificationService redactManagerClassificationService;
|
private RedactManagerClassificationService redactManagerClassificationService;
|
||||||
|
|
||||||
|
@Autowired
|
||||||
|
private ObjectMapper objectMapper;
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
@ -51,16 +82,69 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
||||||
Document document = buildGraph(fileName, LayoutParsingType.TAAS);
|
Document document = buildGraph(fileName, LayoutParsingType.TAAS);
|
||||||
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
|
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||||
viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
|
viewerDocumentService.createViewerDocument(pdDocument, document, out, null,true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void testLayoutParsingServiceResults() {
|
||||||
|
String tableSourceFileName ="C:\\Users\\YannikHampe\\Downloads\\b28d9a22b674906813f12b86dda33202.EXTRACTED_TABLES.json\\b28d9a22b674906813f12b86dda33202.EXTRACTED_TABLES.json";
|
||||||
|
Path pdfFileResource = Path.of("C:\\Users\\YannikHampe\\Downloads\\2009-1048395_50pages_tables.pdf");
|
||||||
|
String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/rectangles."+pdfFileResource.getFileName();
|
||||||
|
PDDocument pdDocument = Loader.loadPDF(pdfFileResource.toFile());
|
||||||
|
|
||||||
|
try (InputStream inputStream = Files.newInputStream(Path.of(tableSourceFileName))) {
|
||||||
|
TableExtractorResponse tableExtractorResponse = objectMapper.readValue(inputStream, TableExtractorResponse.class);
|
||||||
|
tableExtractorResponse.getData().forEach(data -> {
|
||||||
|
List<TableCells> tableCells = convertTableCells(data.getTables());
|
||||||
|
});
|
||||||
|
inputStream.close();
|
||||||
|
}
|
||||||
|
try (var out = new FileOutputStream(tmpFileName)) {
|
||||||
|
pdDocument.save(out);
|
||||||
|
pdDocument.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<TableCells> convertTableCells(List<ExtractedTable> tableObjects) {
|
||||||
|
|
||||||
|
List<TableCells> parsedTableCells = new ArrayList<>();
|
||||||
|
|
||||||
|
tableObjects.stream().forEach(t -> {
|
||||||
|
System.out.println(t.getTable().getLabel());
|
||||||
|
TableCells tableCells = new TableCells();
|
||||||
|
tableCells.setX0(t.getTable().getBbox().get(0));
|
||||||
|
tableCells.setX1(t.getTable().getBbox().get(2));
|
||||||
|
tableCells.setY0(t.getTable().getBbox().get(1));
|
||||||
|
tableCells.setY1(t.getTable().getBbox().get(3));
|
||||||
|
tableCells.setWidth(tableCells.getX1()- tableCells.getX0());
|
||||||
|
tableCells.setHeight(tableCells.getY1()- tableCells.getY0());
|
||||||
|
parsedTableCells.add(tableCells);
|
||||||
|
t.getObjects().forEach(o -> {
|
||||||
|
System.out.println(o.getLabel());
|
||||||
|
TableCells objectCell = new TableCells();
|
||||||
|
objectCell.setX0(t.getTable().getBbox().get(0));
|
||||||
|
objectCell.setX1(t.getTable().getBbox().get(2));
|
||||||
|
objectCell.setY0(t.getTable().getBbox().get(1));
|
||||||
|
objectCell.setY1(t.getTable().getBbox().get(3));
|
||||||
|
objectCell.setWidth(objectCell.getX1()- objectCell.getX0());
|
||||||
|
objectCell.setHeight(objectCell.getY1()- objectCell.getY0());
|
||||||
|
parsedTableCells.add(objectCell);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
return parsedTableCells;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
originDocument,
|
originDocument,
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse());
|
new TableServiceResponse(), new TableExtractorResponse());
|
||||||
|
|
||||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||||
|
|
||||||
|
|||||||
@ -33,6 +33,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePag
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||||
@ -67,7 +68,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
originDocument,
|
originDocument,
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
tableServiceResponse);
|
tableServiceResponse, new TableExtractorResponse());
|
||||||
|
|
||||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||||
|
|
||||||
|
|||||||
@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||||
@ -79,11 +80,11 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
Loader.loadPDF(filename.toFile()),
|
Loader.loadPDF(filename.toFile()),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse()));
|
new TableServiceResponse(), new TableExtractorResponse()));
|
||||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
Loader.loadPDF(filename.toFile()),
|
Loader.loadPDF(filename.toFile()),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse()));
|
new TableServiceResponse(), new TableExtractorResponse()));
|
||||||
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
||||||
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
||||||
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
|
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
|
||||||
|
|||||||
@ -26,6 +26,8 @@ import org.springframework.test.context.junit.jupiter.SpringExtension;
|
|||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import javax.swing.text.html.Option;
|
||||||
|
|
||||||
@ExtendWith(SpringExtension.class)
|
@ExtendWith(SpringExtension.class)
|
||||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||||
@Import(AbstractTest.TestConfiguration.class)
|
@Import(AbstractTest.TestConfiguration.class)
|
||||||
@ -46,6 +48,8 @@ public abstract class AbstractTest {
|
|||||||
protected final static String ORIGIN_FILE_ID = "origin";
|
protected final static String ORIGIN_FILE_ID = "origin";
|
||||||
protected final static String TABLE_FILE_ID = "table";
|
protected final static String TABLE_FILE_ID = "table";
|
||||||
protected final static String IMAGE_FILE_ID = "image";
|
protected final static String IMAGE_FILE_ID = "image";
|
||||||
|
|
||||||
|
protected final static String TABLE_EXTRACTOR_FILE_ID = "extractedTable";
|
||||||
protected final static String STRUCTURE_FILE_ID = "structure";
|
protected final static String STRUCTURE_FILE_ID = "structure";
|
||||||
protected final static String TEXT_FILE_ID = "texts";
|
protected final static String TEXT_FILE_ID = "texts";
|
||||||
protected final static String POSITION_FILE_ID = "positions";
|
protected final static String POSITION_FILE_ID = "positions";
|
||||||
@ -62,6 +66,7 @@ public abstract class AbstractTest {
|
|||||||
.originFileStorageId(ORIGIN_FILE_ID)
|
.originFileStorageId(ORIGIN_FILE_ID)
|
||||||
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
||||||
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
|
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
|
||||||
|
.tableExtractorFileId(Optional.of(TABLE_EXTRACTOR_FILE_ID))
|
||||||
.structureFileStorageId(STRUCTURE_FILE_ID)
|
.structureFileStorageId(STRUCTURE_FILE_ID)
|
||||||
.textBlockFileStorageId(TEXT_FILE_ID)
|
.textBlockFileStorageId(TEXT_FILE_ID)
|
||||||
.positionBlockFileStorageId(POSITION_FILE_ID)
|
.positionBlockFileStorageId(POSITION_FILE_ID)
|
||||||
@ -89,7 +94,7 @@ public abstract class AbstractTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
protected LayoutParsingRequest prepareStorage(String file) {
|
protected LayoutParsingRequest prepareStorage(String file) {
|
||||||
|
|
||||||
return prepareStorage(file, "cv_table_parsing_response/empty.json", "image_service_response/empty.json");
|
return prepareStorage(file, "cv_table_parsing_response/empty.json", "image_service_response/empty.json","table_extractor_response/empty.json");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -107,6 +112,7 @@ public abstract class AbstractTest {
|
|||||||
.originFileStorageId(ORIGIN_FILE_ID)
|
.originFileStorageId(ORIGIN_FILE_ID)
|
||||||
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
||||||
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
|
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
|
||||||
|
.tableExtractorFileId(Optional.of(TABLE_EXTRACTOR_FILE_ID))
|
||||||
.structureFileStorageId(STRUCTURE_FILE_ID)
|
.structureFileStorageId(STRUCTURE_FILE_ID)
|
||||||
.textBlockFileStorageId(TEXT_FILE_ID)
|
.textBlockFileStorageId(TEXT_FILE_ID)
|
||||||
.positionBlockFileStorageId(POSITION_FILE_ID)
|
.positionBlockFileStorageId(POSITION_FILE_ID)
|
||||||
@ -117,21 +123,23 @@ public abstract class AbstractTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile) {
|
protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile, String tableExtractorResponseFile) {
|
||||||
|
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource(file);
|
ClassPathResource pdfFileResource = new ClassPathResource(file);
|
||||||
ClassPathResource cvServiceResponseFileResource = new ClassPathResource(cvServiceResponseFile);
|
ClassPathResource cvServiceResponseFileResource = new ClassPathResource(cvServiceResponseFile);
|
||||||
ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile);
|
ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile);
|
||||||
|
ClassPathResource tableExtractorResponseFileRessource = new ClassPathResource(tableExtractorResponseFile);
|
||||||
|
|
||||||
return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream());
|
return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream(),tableExtractorResponseFileRessource.getInputStream());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream) {
|
protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream, InputStream tableExtractorResponseFileStream) {
|
||||||
|
|
||||||
storageService.storeObject(TenantContext.getTenantId(), IMAGE_FILE_ID, imageInfoStream);
|
storageService.storeObject(TenantContext.getTenantId(), IMAGE_FILE_ID, imageInfoStream);
|
||||||
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
|
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
|
||||||
|
storageService.storeObject(TenantContext.getTenantId(), TABLE_EXTRACTOR_FILE_ID, tableExtractorResponseFileStream);
|
||||||
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
|
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
|
||||||
|
|
||||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||||
|
|||||||
@ -11,6 +11,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
|
|
||||||
@ -28,7 +29,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
|||||||
ClassPathResource fileResource = new ClassPathResource(filename);
|
ClassPathResource fileResource = new ClassPathResource(filename);
|
||||||
prepareStorage(filename);
|
prepareStorage(filename);
|
||||||
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream.readAllBytes())) {
|
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream.readAllBytes())) {
|
||||||
return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse());
|
return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse(), new TableExtractorResponse());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -44,7 +45,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
|||||||
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
|
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
|
||||||
|
|
||||||
if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
|
if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
|
||||||
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json");
|
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json","table_extractor_response/empty.json");
|
||||||
} else {
|
} else {
|
||||||
prepareStorage(filename);
|
prepareStorage(filename);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,8 @@
|
|||||||
|
{
|
||||||
|
"dossierId": "123",
|
||||||
|
"fileId": "123",
|
||||||
|
"operation": "table",
|
||||||
|
"targetFileExtension": "ORIGIN.pdf.gz",
|
||||||
|
"responseFileExtension": "TABLES.json.gz",
|
||||||
|
"extractedTableData": []
|
||||||
|
}
|
||||||
@ -4,5 +4,5 @@ gradle assemble
|
|||||||
|
|
||||||
buildNumber=${1:-1}
|
buildNumber=${1:-1}
|
||||||
|
|
||||||
gradle bootBuildImage --cleanCache --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$USER-$buildNumber
|
gradle bootBuildImage --cleanCache --publishImage -Pversion=layout-parser-yannik-$buildNumber --stacktrace
|
||||||
echo "nexus.knecon.com:5001/red/${dir}-server-v1:$USER-$buildNumber"
|
echo "nexus.knecon.com:5001/red/${dir}-server-v1:table-extractor-yannik-$buildNumber"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user