Compare commits
13 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
aa1e77dda3 | ||
|
|
0c3b910088 | ||
|
|
b4e5f2da2f | ||
|
|
b47b187c8a | ||
|
|
0d4800622d | ||
|
|
cc492bc50d | ||
|
|
1c3c5385e1 | ||
|
|
bce2558133 | ||
|
|
2200574b6d | ||
|
|
775c943f7e | ||
|
|
b7fe6fd3c4 | ||
|
|
3dd447ebef | ||
|
|
f4e93ef03b |
@ -20,6 +20,9 @@ public record LayoutParsingRequest(
|
||||
@NonNull String originFileStorageId,//
|
||||
@Schema(description = "Optional Path to the table extraction file.")//
|
||||
Optional<String> tablesFileStorageId,//
|
||||
|
||||
@Schema(description= "Optional Path to the the table parsing service file")
|
||||
Optional<String> tableExtractorFileId,
|
||||
@Schema(description = "Optional Path to the image classification file.")//
|
||||
Optional<String> imagesFileStorageId,//
|
||||
|
||||
|
||||
@ -30,8 +30,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.TableExtractorResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
@ -63,6 +66,7 @@ public class LayoutParsingPipeline {
|
||||
private final ImageServiceResponseAdapter imageServiceResponseAdapter;
|
||||
private final CvTableParsingAdapter cvTableParsingAdapter;
|
||||
private final LayoutParsingStorageService layoutParsingStorageService;
|
||||
private final TableExtractorResponseAdapter tableExtractorResponseAdapter;
|
||||
private final SectionsBuilderService sectionsBuilderService;
|
||||
private final TaasClassificationService taasClassificationService;
|
||||
private final RedactManagerClassificationService redactManagerClassificationService;
|
||||
@ -87,12 +91,21 @@ public class LayoutParsingPipeline {
|
||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
||||
}
|
||||
|
||||
TableExtractorResponse tableExtractorResponse = new TableExtractorResponse();
|
||||
if (layoutParsingRequest.tableExtractorFileId().isPresent()) {
|
||||
tableExtractorResponse = layoutParsingStorageService.getExtractedTableFile(layoutParsingRequest.tableExtractorFileId().get());
|
||||
}
|
||||
|
||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
||||
}
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse);
|
||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
|
||||
originDocument,
|
||||
imageServiceResponse,
|
||||
tableServiceResponse,
|
||||
tableExtractorResponse);
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
|
||||
|
||||
int numberOfPages = originDocument.getNumberOfPages();
|
||||
@ -100,8 +113,9 @@ public class LayoutParsingPipeline {
|
||||
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
|
||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
|
||||
|
||||
Map<Integer, List<TableExtractorCells>> extractedTableCells = tableExtractorResponseAdapter.buildExtractedTablesPerPage(tableExtractorResponse);
|
||||
try (var out = new ByteArrayOutputStream()) {
|
||||
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out, false);
|
||||
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out,extractedTableCells ,false);
|
||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out);
|
||||
}
|
||||
|
||||
@ -157,11 +171,12 @@ public class LayoutParsingPipeline {
|
||||
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
|
||||
PDDocument originDocument,
|
||||
ImageServiceResponse imageServiceResponse,
|
||||
TableServiceResponse tableServiceResponse) {
|
||||
TableServiceResponse tableServiceResponse,
|
||||
TableExtractorResponse tableExtractorResponse) {
|
||||
|
||||
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||
|
||||
Map<Integer, List<TableExtractorCells>> extractedTableCells = tableExtractorResponseAdapter.buildExtractedTablesPerPage(tableExtractorResponse);
|
||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
|
||||
@ -244,9 +259,9 @@ public class LayoutParsingPipeline {
|
||||
|
||||
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
|
||||
|
||||
if (!classificationPage.isLandscape()) {
|
||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||
}
|
||||
if (!classificationPage.isLandscape()) {
|
||||
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
|
||||
}
|
||||
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
|
||||
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
|
||||
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());
|
||||
|
||||
@ -23,6 +23,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Si
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
|
||||
@ -63,6 +64,16 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
|
||||
|
||||
public TableExtractorResponse getExtractedTableFile(String storageId) throws IOException {
|
||||
|
||||
try (InputStream inputStream = getObject(storageId)) {
|
||||
TableExtractorResponse tableExtractorResponse = objectMapper.readValue(inputStream, TableExtractorResponse.class);
|
||||
inputStream.close();
|
||||
return tableExtractorResponse;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public TableServiceResponse getTablesFile(String storageId) throws IOException {
|
||||
|
||||
try (var tableClassificationStream = getObject(storageId)) {
|
||||
@ -83,7 +94,6 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) {
|
||||
|
||||
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);
|
||||
|
||||
@ -14,7 +14,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFre
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
|
||||
@ -19,4 +19,5 @@ public class PageContents {
|
||||
Rectangle2D cropBox;
|
||||
Rectangle2D mediaBox;
|
||||
List<Ruling> rulings;
|
||||
|
||||
}
|
||||
|
||||
@ -108,11 +108,13 @@ public class Boundary implements Comparable<Boundary> {
|
||||
return splitBoundaries;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Streams every integer covered by this boundary.
 *
 * @return an {@link IntStream} from {@code start} (inclusive) to {@code end} (exclusive); empty when {@code start >= end}
 */
public IntStream intStream() {
|
||||
|
||||
return IntStream.range(start, end);
|
||||
}
|
||||
|
||||
|
||||
public static Boundary merge(Collection<Boundary> boundaries) {
|
||||
|
||||
int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);
|
||||
|
||||
@ -105,6 +105,7 @@ public class Document implements GenericSemanticNode {
|
||||
return streamAllSubNodes().collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
||||
|
||||
@ -207,6 +207,7 @@ public class Table implements SemanticNode {
|
||||
return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Streams all TableCells row-wise and filters them with header == true.
|
||||
*
|
||||
|
||||
@ -109,10 +109,7 @@ public class AtomicTextBlock implements TextBlock {
|
||||
}
|
||||
|
||||
|
||||
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData,
|
||||
DocumentPositionData documentPositionData,
|
||||
SemanticNode parent,
|
||||
Page page) {
|
||||
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, DocumentPositionData documentPositionData, SemanticNode parent, Page page) {
|
||||
|
||||
return AtomicTextBlock.builder()
|
||||
.id(documentTextData.getId())
|
||||
|
||||
@ -1,14 +1,12 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
@ -50,6 +48,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
return getColCount() == 0 || getRowCount() == 0;
|
||||
}
|
||||
|
||||
|
||||
public List<List<Cell>> getRows() {
|
||||
|
||||
if (rows == null) {
|
||||
@ -276,21 +275,17 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
|
||||
public boolean intersects(Cell cell1, Cell cell2) {
|
||||
|
||||
if (cell1.getHeight() <= 0 || cell2.getHeight() <= 0) {
|
||||
return false;
|
||||
}
|
||||
double x0 = cell1.getX() + 2;
|
||||
double y0 = cell1.getY() + 2;
|
||||
return (cell2.x + cell2.width > x0 &&
|
||||
cell2.y + cell2.height > y0 &&
|
||||
cell2.x < x0 + cell1.getWidth() -2 &&
|
||||
cell2.y < y0 + cell1.getHeight() -2);
|
||||
return (cell2.x + cell2.width > x0 && cell2.y + cell2.height > y0 && cell2.x < x0 + cell1.getWidth() - 2 && cell2.y < y0 + cell1.getHeight() - 2);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public String getText() {
|
||||
|
||||
@ -328,8 +323,6 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
public String getTextAsHtml() {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
@ -73,7 +73,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
return sequences.get(0).getPageWidth();
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||
|
||||
@ -82,6 +82,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
return fromTextPositionSequences(sequences);
|
||||
}
|
||||
|
||||
|
||||
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
|
||||
|
||||
TextPageBlock textBlock = null;
|
||||
@ -133,7 +134,6 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Returns the minX value in pdf coordinate system.
|
||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
|
||||
@ -234,6 +234,7 @@ public class TextPositionSequence implements CharSequence {
|
||||
@JsonIgnore
|
||||
@JsonAttribute(ignore = true)
|
||||
public String getFontStyle() {
|
||||
|
||||
if (textPositions.get(0).getFontName() == null) {
|
||||
return "standard";
|
||||
}
|
||||
|
||||
@ -9,10 +9,10 @@ import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@ -20,8 +20,7 @@ import lombok.RequiredArgsConstructor;
|
||||
@RequiredArgsConstructor
|
||||
public class ImageServiceResponseAdapter {
|
||||
|
||||
|
||||
public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse ) {
|
||||
public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse) {
|
||||
|
||||
Map<Integer, List<ClassifiedImage>> images = new HashMap<>();
|
||||
imageServiceResponse.getData().forEach(imageMetadata -> {
|
||||
|
||||
@ -0,0 +1,74 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.python_api.adapter;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTable;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTableData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Service
|
||||
@Slf4j
|
||||
@RequiredArgsConstructor
|
||||
public class TableExtractorResponseAdapter {
|
||||
|
||||
public Map<Integer, List<TableExtractorCells>> buildExtractedTablesPerPage(TableExtractorResponse tableExtractorResponse) {
|
||||
Map<Integer, List<TableExtractorCells>> tableCells = new HashMap<>();
|
||||
tableExtractorResponse.getData()
|
||||
.forEach(tableData -> tableCells.computeIfAbsent(tableData.getPage_number(), tableCell -> new ArrayList<>())
|
||||
.addAll(convertTableCells(tableData.getTables())));
|
||||
|
||||
return tableCells;
|
||||
|
||||
}
|
||||
|
||||
public List<TableExtractorCells> convertTableCells(List<ExtractedTable> tableObjects) {
|
||||
|
||||
List<TableExtractorCells> parsedTableCells = new ArrayList<>();
|
||||
|
||||
tableObjects.stream().forEach(t -> {
|
||||
TableExtractorCells tableCells = new TableExtractorCells();
|
||||
tableCells.setX0(t.getTable().getBbox().get(0));
|
||||
tableCells.setX1(t.getTable().getBbox().get(2));
|
||||
tableCells.setY0(t.getTable().getBbox().get(1));
|
||||
tableCells.setY1(t.getTable().getBbox().get(3));
|
||||
tableCells.setWidth(tableCells.getX1()- tableCells.getX0());
|
||||
tableCells.setHeight(tableCells.getY1()- tableCells.getY0());
|
||||
tableCells.setLabel(t.getTable().getLabel());
|
||||
log.info("Parsed table cell {} with label {}",tableCells, tableCells.getLabel());
|
||||
parsedTableCells.add(tableCells);
|
||||
t.getObjects().forEach(o -> {
|
||||
TableExtractorCells objectCell = new TableExtractorCells();
|
||||
objectCell.setX0(t.getTable().getBbox().get(0));
|
||||
objectCell.setX1(t.getTable().getBbox().get(2));
|
||||
objectCell.setY0(t.getTable().getBbox().get(1));
|
||||
objectCell.setY1(t.getTable().getBbox().get(3));
|
||||
objectCell.setWidth(objectCell.getX1()- objectCell.getX0());
|
||||
objectCell.setHeight(objectCell.getY1()- objectCell.getY0());
|
||||
objectCell.setLabel(o.getLabel());
|
||||
log.info("Parsed object cell {} with label {}",objectCell, objectCell.getLabel());
|
||||
parsedTableCells.add(objectCell);
|
||||
});
|
||||
});
|
||||
|
||||
return parsedTableCells;
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@ -0,0 +1,20 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * One table entry of the table extractor response: the table detection itself plus the detections listed under it.
 */
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class ExtractedTable {
|
||||
|
||||
// NOTE(review): presumably marks a table that was detected as rotated — confirm against the table extractor service.
private boolean rotated;
|
||||
// Detection of the table as a whole (label, score, bbox).
private ExtractedTableData table;
|
||||
// Further detections belonging to this table; TableExtractorResponseAdapter turns each one into its own cell.
private List<ExtractedTableData> objects;
|
||||
|
||||
}
|
||||
@ -0,0 +1,20 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * A single labelled detection (a table or an object inside a table) with its bounding box.
 */
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class ExtractedTableData {
|
||||
|
||||
// Label of the detection; copied unchanged into TableExtractorCells.label by the adapter.
private String label;
|
||||
// NOTE(review): presumably the detection confidence — range and meaning are not visible here, confirm.
private float score;
|
||||
// Bounding box; TableExtractorResponseAdapter reads index 0 as x0, 1 as y0, 2 as x1 and 3 as y1.
private List<Float> bbox;
|
||||
|
||||
}
|
||||
@ -0,0 +1,21 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * Flat bounding-box representation of one table extractor detection, as produced by TableExtractorResponseAdapter.
 */
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class TableExtractorCells {
|
||||
// First corner of the box (bbox entries 0 and 1 of the source detection).
private float x0;
|
||||
private float y0;
|
||||
// Second corner of the box (bbox entries 2 and 3 of the source detection).
private float x1;
|
||||
private float y1;
|
||||
// Set by the adapter to x1 - x0.
private float width;
|
||||
// Set by the adapter to y1 - y0.
private float height;
|
||||
// Label of the source detection.
private String label;
|
||||
|
||||
}
|
||||
@ -0,0 +1,19 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * Per-page entry of the table extractor response.
 */
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class TableExtractorData {
|
||||
|
||||
// Page the tables belong to; used as the map key in TableExtractorResponseAdapter.buildExtractedTablesPerPage.
// NOTE(review): the snake_case name presumably mirrors the JSON key of the service response — confirm before renaming.
private int page_number;
|
||||
// NOTE(review): meaning not visible in this change set (possibly an image index) — confirm.
private int image;
|
||||
// Tables found on this page.
private List<ExtractedTable> tables;
|
||||
}
|
||||
@ -0,0 +1,23 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
/**
 * Top-level response of the table extractor service.
 * LayoutParsingStorageService.getExtractedTableFile creates instances via {@code objectMapper.readValue}.
 */
@Data
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
public class TableExtractorResponse {
|
||||
|
||||
// NOTE(review): the following fields look like request metadata echoed back by the service — confirm.
private String dossierId;
|
||||
private String fileId;
|
||||
private String targetFileExtension;
|
||||
private String responseFileExtension;
|
||||
// NOTE(review): the upper-case name presumably mirrors the key used in the service payload — confirm before renaming.
private String X_TENANT_ID;
|
||||
// One entry per page. There is no default value: the field is null after the no-args constructor.
private List<TableExtractorData> data;
|
||||
|
||||
}
|
||||
@ -25,6 +25,7 @@ public class BodyTextFrameService {
|
||||
private static final float RULING_HEIGHT_THRESHOLD = 0.15f; // multiplied with page height. Header/Footer Rulings must be within that border of the page.
|
||||
private static final float RULING_WIDTH_THRESHOLD = 0.75f; // multiplied with page width. Header/Footer Rulings must be at least that wide.
|
||||
|
||||
|
||||
public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {
|
||||
|
||||
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
|
||||
@ -155,8 +156,9 @@ public class BodyTextFrameService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)) {
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || MarkedContentUtils.intersects(textBlock,
|
||||
page.getMarkedContentBboxPerType(),
|
||||
MarkedContentUtils.FOOTER)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
@ -22,7 +22,6 @@ public class DividingColumnDetectionService {
|
||||
|
||||
public List<Rectangle2D> detectColumns(PageContents pageContents) {
|
||||
|
||||
|
||||
if (pageContents.getSortedTextPositionSequences().size() < 2) {
|
||||
return List.of(pageContents.getCropBox());
|
||||
}
|
||||
|
||||
@ -72,11 +72,13 @@ public class GapDetectionService {
|
||||
return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle()));
|
||||
}
|
||||
|
||||
|
||||
/**
 * Returns a copy of the rectangle with the same x and width, whose y is the smaller of
 * {@code getMinY()} and {@code getMaxY()} and whose height is the absolute value of the original height.
 * NOTE(review): presumably meant to normalize rectangles with a negative height — confirm against callers.
 */
private static Rectangle2D mirrorY(Rectangle2D rectangle2D) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle2D.getX(), Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()), rectangle2D.getWidth(), Math.abs(rectangle2D.getHeight()));
|
||||
}
|
||||
|
||||
|
||||
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
|
||||
|
||||
context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(),
|
||||
|
||||
@ -6,7 +6,6 @@ import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Queue;
|
||||
import java.util.stream.Stream;
|
||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
|
||||
@ -51,7 +50,9 @@ public class GapsAcrossLinesService {
|
||||
}
|
||||
|
||||
return columnFactory.outputGaps.stream()
|
||||
.filter(gapAcrossLines -> columnFactory.outputGaps.stream().filter(gapAcrossLines::intersectsX).noneMatch(gapAcrossLines1 -> gapAcrossLines1.lineCount > gapAcrossLines.lineCount))
|
||||
.filter(gapAcrossLines -> columnFactory.outputGaps.stream()
|
||||
.filter(gapAcrossLines::intersectsX)
|
||||
.noneMatch(gapAcrossLines1 -> gapAcrossLines1.lineCount > gapAcrossLines.lineCount))
|
||||
.filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMinX() - mainBodyTextFrame.getMinX()) > DISTANCE_TO_BORDER_THRESHOLD)
|
||||
.filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMaxX() - mainBodyTextFrame.getMaxX()) > DISTANCE_TO_BORDER_THRESHOLD)
|
||||
.map(GapAcrossLines::getRectangle2D)
|
||||
|
||||
@ -6,8 +6,8 @@ import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
|
||||
@ -16,8 +16,7 @@ public class MainBodyTextFrameExtractionService {
|
||||
|
||||
public Rectangle2D calculateMainBodyTextFrame(LineInformation lineInformation) {
|
||||
|
||||
Rectangle2D mainBodyTextFrame = lineInformation.getLineBBox().stream()
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
Rectangle2D mainBodyTextFrame = lineInformation.getLineBBox().stream().collect(RectangleTransformations.collectBBox());
|
||||
|
||||
return RectangleTransformations.pad(mainBodyTextFrame, mainBodyTextFrame.getWidth() * TEXT_FRAME_PAD_WIDTH, mainBodyTextFrame.getHeight() * TEXT_FRAME_PAD_HEIGHT);
|
||||
}
|
||||
|
||||
@ -52,7 +52,7 @@ public class PageContentExtractor {
|
||||
stripper.getRulings()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return textPositionSequencesPerPage;
|
||||
}
|
||||
|
||||
|
||||
@ -5,9 +5,9 @@ import java.util.List;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
|
||||
|
||||
@Service
|
||||
public class SimplifiedSectionTextService {
|
||||
@ -23,4 +23,5 @@ public class SimplifiedSectionTextService {
|
||||
|
||||
return SimplifiedSectionText.builder().sectionNumber(section.getTreeId().get(0)).text(section.getTextBlock().getSearchText()).build();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,9 +1,20 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
|
||||
// TODO: figure out why this static import fails the build
|
||||
// import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
@ -11,12 +22,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@Service
|
||||
@SuppressWarnings("all")
|
||||
@ -83,13 +88,13 @@ public class TaasBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText());
|
||||
boolean isListIdentifier = listIdentifierPattern.find();
|
||||
|
||||
boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||
|
||||
boolean sameFont = previousTextBlock.getMostPopularWordFont().equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize();
|
||||
boolean sameFont = previousTextBlock.getMostPopularWordFont()
|
||||
.equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize();
|
||||
// boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
|
||||
|
||||
boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD;
|
||||
@ -119,8 +124,9 @@ public class TaasBlockificationService {
|
||||
}
|
||||
alreadyMerged.add(textPageBlock);
|
||||
textBlocksToMerge.add(Stream.concat(Stream.of(textPageBlock),
|
||||
textPageBlocks.stream().filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2)).peek(alreadyMerged::add))
|
||||
.toList());
|
||||
textPageBlocks.stream()
|
||||
.filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2))
|
||||
.peek(alreadyMerged::add)).toList());
|
||||
}
|
||||
return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
|
||||
}
|
||||
@ -163,8 +169,7 @@ public class TaasBlockificationService {
|
||||
while (itty.hasNext()) {
|
||||
TextPageBlock block = (TextPageBlock) itty.next();
|
||||
|
||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(
|
||||
block.getMaxY(),
|
||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
||||
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||
previous.add(block);
|
||||
@ -189,7 +194,6 @@ public class TaasBlockificationService {
|
||||
TextPositionSequence prev = null;
|
||||
// TODO: make static final constant
|
||||
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Float splitX1 = null;
|
||||
for (TextPositionSequence word : textPositions) {
|
||||
|
||||
@ -5,7 +5,6 @@ import java.util.Locale;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
@ -13,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -63,16 +63,16 @@ public class DocuMineClassificationService {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
return;
|
||||
}
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
||||
) {
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
||||
textBlock,
|
||||
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())
|
||||
) {
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
||||
textBlock,
|
||||
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||
|
||||
@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
@ -11,6 +10,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -21,7 +21,6 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class RedactManagerClassificationService {
|
||||
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
@ -52,14 +51,16 @@ public class RedactManagerClassificationService {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
return;
|
||||
}
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
||||
textBlock,
|
||||
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter()
|
||||
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) {
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
||||
textBlock,
|
||||
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||
.getMostPopular())) {
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||
|
||||
@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
@ -12,6 +11,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
@ -27,7 +27,6 @@ public class TaasClassificationService {
|
||||
|
||||
public void classifyDocument(ClassificationDocument document) {
|
||||
|
||||
|
||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||
|
||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||
@ -57,11 +56,13 @@ public class TaasClassificationService {
|
||||
textBlock.setClassification(PageBlockType.OTHER);
|
||||
return;
|
||||
}
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
|
||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
||||
textBlock,
|
||||
page.getRotation())) {
|
||||
textBlock.setClassification(PageBlockType.HEADER);
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) {
|
||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
||||
textBlock,
|
||||
page.getRotation())) {
|
||||
textBlock.setClassification(PageBlockType.FOOTER);
|
||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
||||
|
||||
@ -18,8 +18,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
|
||||
@ -31,6 +29,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||
|
||||
|
||||
@ -8,10 +8,10 @@ import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Objects;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@ -110,6 +110,7 @@ public class SearchTextWithTextPositionFactory {
|
||||
return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE;
|
||||
}
|
||||
|
||||
|
||||
private static List<Boundary> mergeToBoundaries(List<Integer> integers) {
|
||||
|
||||
if (integers.isEmpty()) {
|
||||
@ -125,8 +126,9 @@ public class SearchTextWithTextPositionFactory {
|
||||
}
|
||||
end = current + 1;
|
||||
}
|
||||
if (boundaries.isEmpty())
|
||||
if (boundaries.isEmpty()) {
|
||||
boundaries.add(new Boundary(start, end));
|
||||
}
|
||||
return boundaries;
|
||||
}
|
||||
|
||||
@ -138,6 +140,7 @@ public class SearchTextWithTextPositionFactory {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
|
||||
|
||||
return Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition);
|
||||
@ -163,17 +166,7 @@ public class SearchTextWithTextPositionFactory {
|
||||
|
||||
private boolean isHyphen(String unicodeCharacter) {
|
||||
|
||||
return Objects.equals(unicodeCharacter, "-") || //
|
||||
Objects.equals(unicodeCharacter, "~") || //
|
||||
Objects.equals(unicodeCharacter, "‐") || //
|
||||
Objects.equals(unicodeCharacter, "‒") || //
|
||||
Objects.equals(unicodeCharacter, "⁻") || //
|
||||
Objects.equals(unicodeCharacter, "−") || //
|
||||
Objects.equals(unicodeCharacter, "﹣") || //
|
||||
Objects.equals(unicodeCharacter, "゠") || //
|
||||
Objects.equals(unicodeCharacter, "⁓") || //
|
||||
Objects.equals(unicodeCharacter, "‑") || //
|
||||
Objects.equals(unicodeCharacter, "\u00AD");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -11,12 +11,12 @@ import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@ -8,15 +8,15 @@ import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@ -2,10 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.factory;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@ -7,11 +7,11 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
@ -9,7 +8,6 @@ import java.util.Map;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||
|
||||
|
||||
@ -329,6 +329,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String getText(PDDocument doc) throws IOException {
|
||||
|
||||
|
||||
@ -25,10 +25,23 @@ import java.io.StringWriter;
|
||||
import java.io.Writer;
|
||||
import java.text.Bidi;
|
||||
import java.text.Normalizer;
|
||||
import java.util.*;
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.Deque;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.SortedMap;
|
||||
import java.util.SortedSet;
|
||||
import java.util.StringTokenizer;
|
||||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import lombok.Getter;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
@ -46,6 +59,8 @@ import org.apache.pdfbox.text.TextPositionComparator;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
|
||||
|
||||
import lombok.Getter;
|
||||
|
||||
/**
|
||||
* This is just a copy except i only adjusted lines 594-607 cause this is a bug in Pdfbox.
|
||||
* see S416.pdf
|
||||
@ -194,40 +209,33 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
}
|
||||
|
||||
|
||||
public void beginMarkedContentSequence(COSName tag, COSDictionary properties) {
|
||||
|
||||
public void beginMarkedContentSequence(COSName tag, COSDictionary properties)
|
||||
{
|
||||
PDMarkedContent markedContent = PDMarkedContent.create(tag, properties);
|
||||
if (this.currentMarkedContents.isEmpty())
|
||||
{
|
||||
if (this.currentMarkedContents.isEmpty()) {
|
||||
this.markedContents.add(markedContent);
|
||||
}
|
||||
else
|
||||
{
|
||||
PDMarkedContent currentMarkedContent =
|
||||
this.currentMarkedContents.peek();
|
||||
if (currentMarkedContent != null)
|
||||
{
|
||||
} else {
|
||||
PDMarkedContent currentMarkedContent = this.currentMarkedContents.peek();
|
||||
if (currentMarkedContent != null) {
|
||||
currentMarkedContent.addMarkedContent(markedContent);
|
||||
}
|
||||
}
|
||||
this.currentMarkedContents.push(markedContent);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void endMarkedContentSequence()
|
||||
{
|
||||
if (!this.currentMarkedContents.isEmpty())
|
||||
{
|
||||
public void endMarkedContentSequence() {
|
||||
|
||||
if (!this.currentMarkedContents.isEmpty()) {
|
||||
this.currentMarkedContents.pop();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void xobject(PDXObject xobject)
|
||||
{
|
||||
if (!this.currentMarkedContents.isEmpty())
|
||||
{
|
||||
public void xobject(PDXObject xobject) {
|
||||
|
||||
if (!this.currentMarkedContents.isEmpty()) {
|
||||
this.currentMarkedContents.peek().addXObject(xobject);
|
||||
}
|
||||
}
|
||||
@ -635,7 +643,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
var normalized = normalize(line);
|
||||
// normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent()
|
||||
|
||||
|
||||
lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
|
||||
writeLine(normalized, current.isParagraphStart);
|
||||
line.clear();
|
||||
@ -914,8 +921,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
textList.add(text);
|
||||
}
|
||||
}
|
||||
if (!this.currentMarkedContents.isEmpty())
|
||||
{
|
||||
if (!this.currentMarkedContents.isEmpty()) {
|
||||
this.currentMarkedContents.peek().addText(text);
|
||||
}
|
||||
}
|
||||
@ -2102,7 +2108,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
return endParagraphWritten;
|
||||
}
|
||||
|
||||
public void setEndParagraphWritten(){
|
||||
|
||||
public void setEndParagraphWritten() {
|
||||
|
||||
endParagraphWritten = true;
|
||||
}
|
||||
|
||||
@ -2145,7 +2153,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
||||
this.isHangingIndent = true;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,10 +1,13 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
@ -30,6 +33,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.visualization.
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
@ -40,7 +44,6 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@RequiredArgsConstructor
|
||||
public class ViewerDocumentService {
|
||||
|
||||
|
||||
private static final String LAYER_NAME = "Layout grid";
|
||||
private static final int FONT_SIZE = 10;
|
||||
public static final float LINE_WIDTH = 1f;
|
||||
@ -49,13 +52,18 @@ public class ViewerDocumentService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) {
|
||||
public void createViewerDocument(PDDocument pdDocument,
|
||||
Document document,
|
||||
OutputStream outputStream,
|
||||
Map<Integer,List<TableExtractorCells>> extractedTableCells,
|
||||
boolean layerVisibilityDefaultValue) {
|
||||
|
||||
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
|
||||
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
|
||||
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
|
||||
Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
|
||||
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue);
|
||||
PDOptionalContentGroup visualLayoutParsingLayer = addLayerToDocument(pdDocument, dictionariesToUpdate, true);
|
||||
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
|
||||
|
||||
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
|
||||
@ -72,44 +80,25 @@ public class ViewerDocumentService {
|
||||
assert pageNumber == visualizationsOnPage.getPageNumber();
|
||||
// We need to append to the content stream, otherwise the content could be overlapped by following content.
|
||||
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
|
||||
|
||||
contentStream.beginMarkedContent(COSName.OC, layer);
|
||||
contentStream.beginMarkedContent(COSName.OC, visualLayoutParsingLayer);
|
||||
contentStream.saveGraphicsState();
|
||||
|
||||
contentStream.setLineWidth(LINE_WIDTH);
|
||||
for (ColoredLine coloredLine : visualizationsOnPage.getColoredLines()) {
|
||||
contentStream.setStrokingColor(coloredLine.color());
|
||||
contentStream.moveTo((float) coloredLine.line().getX1(), (float) coloredLine.line().getY1());
|
||||
contentStream.lineTo((float) coloredLine.line().getX2(), (float) coloredLine.line().getY2());
|
||||
for (TableExtractorCells tableCells : extractedTableCells.get(pageNumber)) {
|
||||
contentStream.setStrokingColor(new Color(0xB700FF));
|
||||
contentStream.addRect((float) tableCells.getX0(), (float) tableCells.getY0(), (float) tableCells.getWidth(), (float) tableCells.getHeight());
|
||||
contentStream.stroke();
|
||||
}
|
||||
for (ColoredRectangle coloredRectangle : visualizationsOnPage.getColoredRectangles()) {
|
||||
contentStream.setStrokingColor(coloredRectangle.color());
|
||||
Rectangle2D r = coloredRectangle.rectangle2D();
|
||||
contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
|
||||
contentStream.stroke();
|
||||
}
|
||||
for (FilledRectangle filledRectangle : visualizationsOnPage.getFilledRectangles()) {
|
||||
contentStream.setNonStrokingColor(filledRectangle.color());
|
||||
PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState();
|
||||
graphicsState.setNonStrokingAlphaConstant(filledRectangle.alpha());
|
||||
contentStream.setGraphicsStateParameters(graphicsState);
|
||||
Rectangle2D r = filledRectangle.rectangle2D();
|
||||
contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
|
||||
contentStream.fill();
|
||||
}
|
||||
for (PlacedText placedText : visualizationsOnPage.getPlacedTexts()) {
|
||||
contentStream.setFont(font, FONT_SIZE);
|
||||
contentStream.beginText();
|
||||
Matrix textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
|
||||
(float) textDeRotationMatrix.getShearX(),
|
||||
(float) textDeRotationMatrix.getShearY(),
|
||||
(float) textDeRotationMatrix.getScaleY(),
|
||||
(float) placedText.lineStart().getX(),
|
||||
(float) placedText.lineStart().getY());
|
||||
textMatrix.translate(-((font.getStringWidth(placedText.text()) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), -FONT_SIZE);
|
||||
tableCells.getX0() ,
|
||||
tableCells.getY0());
|
||||
textMatrix.translate(-((font.getStringWidth(tableCells.getLabel()) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), -FONT_SIZE);
|
||||
contentStream.setTextMatrix(textMatrix);
|
||||
contentStream.showText(placedText.text());
|
||||
contentStream.showText(tableCells.getLabel());
|
||||
contentStream.endText();
|
||||
}
|
||||
contentStream.restoreGraphicsState();
|
||||
|
||||
@ -1,12 +1,5 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import lombok.experimental.UtilityClass;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
@ -14,12 +7,22 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class MarkedContentUtils {
|
||||
|
||||
public static final String HEADER = "Header";
|
||||
public static final String FOOTER = "Footer";
|
||||
|
||||
|
||||
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
|
||||
|
||||
if (markedContents == null) {
|
||||
@ -31,7 +34,8 @@ public class MarkedContentUtils {
|
||||
.filter(m -> m.getProperties() != null)
|
||||
.filter(m -> m.getProperties().getItem("Subtype") != null)
|
||||
.filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype))
|
||||
.map(PDMarkedContent::getContents).flatMap(Collection::stream)
|
||||
.map(PDMarkedContent::getContents)
|
||||
.flatMap(Collection::stream)
|
||||
.filter(t -> t instanceof TextPosition)
|
||||
.map(t -> (TextPosition) t)
|
||||
.filter(t -> !t.getUnicode().equals(" "))
|
||||
@ -41,16 +45,19 @@ public class MarkedContentUtils {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
return markedContentByYPosition.values().stream()
|
||||
.map(textPositions -> new TextPositionSequence(textPositions.stream()
|
||||
.toList(), 0, true)
|
||||
.getRectangle())
|
||||
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList());
|
||||
return markedContentByYPosition.values()
|
||||
.stream()
|
||||
.map(textPositions -> new TextPositionSequence(textPositions.stream().toList(), 0, true).getRectangle())
|
||||
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
public boolean intersects(TextPageBlock textBlock, Map<String, List<Rectangle2D>> markedContentBboxPerType, String type) {
|
||||
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).stream().anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
|
||||
|
||||
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type)
|
||||
.stream()
|
||||
.anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -19,10 +19,9 @@ public final class PositionUtils {
|
||||
|
||||
double threshold = textBlock.getMostPopularWordHeight() * 3;
|
||||
|
||||
if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX()
|
||||
&& textBlock.getPdfMaxX() - threshold < btf.getTopLeft().getX() + btf.getWidth()
|
||||
&& textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY()
|
||||
&& textBlock.getPdfMaxY() - threshold < btf.getTopLeft().getY() + btf.getHeight()) {
|
||||
if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX() && textBlock.getPdfMaxX() - threshold < btf.getTopLeft()
|
||||
.getX() + btf.getWidth() && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY() && textBlock.getPdfMaxY() - threshold < btf.getTopLeft()
|
||||
.getY() + btf.getHeight()) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
|
||||
@ -41,11 +41,14 @@ public class RectangleTransformations {
|
||||
|
||||
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
|
||||
}
|
||||
|
||||
|
||||
public static Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> collectBBox() {
|
||||
|
||||
return new Rectangle2DBBoxCollector();
|
||||
}
|
||||
|
||||
|
||||
public static PDRectangle toPDRectangleBBox(List<Rectangle> rectangles) {
|
||||
|
||||
Rectangle2D rectangle2D = RectangleTransformations.rectangleBBox(rectangles);
|
||||
@ -70,6 +73,7 @@ public class RectangleTransformations {
|
||||
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
|
||||
|
||||
return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector());
|
||||
@ -84,6 +88,7 @@ public class RectangleTransformations {
|
||||
-redactionLogRectangle.getHeight());
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D toRectangle2D(PDRectangle rectangle) {
|
||||
|
||||
return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight());
|
||||
|
||||
@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
|
||||
|
||||
@ -28,15 +28,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPosit
|
||||
*
|
||||
* @author Ben Litchfield
|
||||
*/
|
||||
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence>
|
||||
{
|
||||
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> {
|
||||
|
||||
@Override
|
||||
public int compare(TextPositionSequence pos1, TextPositionSequence pos2)
|
||||
{
|
||||
public int compare(TextPositionSequence pos1, TextPositionSequence pos2) {
|
||||
// only compare text that is in the same direction
|
||||
int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
|
||||
if (cmp1 != 0)
|
||||
{
|
||||
if (cmp1 != 0) {
|
||||
return cmp1;
|
||||
}
|
||||
|
||||
@ -54,19 +52,13 @@ public class TextPositionSequenceComparator implements Comparator<TextPositionSe
|
||||
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
||||
|
||||
// we will do a simple tolerance comparison
|
||||
if (yDifference < .1 ||
|
||||
pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom ||
|
||||
pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)
|
||||
{
|
||||
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
|
||||
return Float.compare(x1, x2);
|
||||
}
|
||||
else if (pos1YBottom < pos2YBottom)
|
||||
{
|
||||
} else if (pos1YBottom < pos2YBottom) {
|
||||
return -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -14,7 +14,7 @@ import com.knecon.fforesight.service.layoutparser.server.queue.MessagingConfigur
|
||||
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
|
||||
|
||||
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class})
|
||||
@Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class, MessagingConfiguration.class})
|
||||
@Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class, MessagingConfiguration.class})
|
||||
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
|
||||
public class Application {
|
||||
|
||||
|
||||
@ -26,6 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipelin
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
@ -50,7 +51,7 @@ public class BdrJsonBuildTest extends AbstractTest {
|
||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS,
|
||||
pdDocument,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
new TableServiceResponse(), new TableExtractorResponse()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -33,6 +33,7 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipelin
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
||||
@ -98,7 +99,7 @@ public class HeadlinesGoldStandardIntegrationTest {
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(pdfFileResource.getFile()),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
new TableServiceResponse(), new TableExtractorResponse()));
|
||||
|
||||
var foundHeadlines = documentGraph.streamAllSubNodes()
|
||||
.map(SemanticNode::getHeadline)
|
||||
|
||||
@ -16,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Do
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||
@ -58,7 +59,7 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(filename.toFile()),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
new TableServiceResponse(), new TableExtractorResponse()));
|
||||
|
||||
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
|
||||
ObjectMapper mapper = ObjectMapperFactory.create();
|
||||
|
||||
@ -1,27 +1,53 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.graph;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw.drawRectangle2DList;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.InputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.json.JSONArray;
|
||||
import org.json.JSONObject;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.fasterxml.jackson.databind.JsonNode;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTable;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.LineDetectionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
@ -29,7 +55,9 @@ import com.knecon.fforesight.service.layoutparser.processor.services.mapper.Docu
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -41,6 +69,9 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
@Autowired
|
||||
private RedactManagerClassificationService redactManagerClassificationService;
|
||||
|
||||
@Autowired
|
||||
private ObjectMapper objectMapper;
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testViewerDocument() {
|
||||
@ -51,16 +82,69 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
|
||||
Document document = buildGraph(fileName, LayoutParsingType.TAAS);
|
||||
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
|
||||
viewerDocumentService.createViewerDocument(pdDocument, document, out, true);
|
||||
viewerDocumentService.createViewerDocument(pdDocument, document, out, null,true);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testLayoutParsingServiceResults() {
|
||||
String tableSourceFileName ="C:\\Users\\YannikHampe\\Downloads\\b28d9a22b674906813f12b86dda33202.EXTRACTED_TABLES.json\\b28d9a22b674906813f12b86dda33202.EXTRACTED_TABLES.json";
|
||||
Path pdfFileResource = Path.of("C:\\Users\\YannikHampe\\Downloads\\2009-1048395_50pages_tables.pdf");
|
||||
String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/rectangles."+pdfFileResource.getFileName();
|
||||
PDDocument pdDocument = Loader.loadPDF(pdfFileResource.toFile());
|
||||
|
||||
try (InputStream inputStream = Files.newInputStream(Path.of(tableSourceFileName))) {
|
||||
TableExtractorResponse tableExtractorResponse = objectMapper.readValue(inputStream, TableExtractorResponse.class);
|
||||
tableExtractorResponse.getData().forEach(data -> {
|
||||
List<TableCells> tableCells = convertTableCells(data.getTables());
|
||||
});
|
||||
inputStream.close();
|
||||
}
|
||||
try (var out = new FileOutputStream(tmpFileName)) {
|
||||
pdDocument.save(out);
|
||||
pdDocument.close();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public List<TableCells> convertTableCells(List<ExtractedTable> tableObjects) {
|
||||
|
||||
List<TableCells> parsedTableCells = new ArrayList<>();
|
||||
|
||||
tableObjects.stream().forEach(t -> {
|
||||
System.out.println(t.getTable().getLabel());
|
||||
TableCells tableCells = new TableCells();
|
||||
tableCells.setX0(t.getTable().getBbox().get(0));
|
||||
tableCells.setX1(t.getTable().getBbox().get(2));
|
||||
tableCells.setY0(t.getTable().getBbox().get(1));
|
||||
tableCells.setY1(t.getTable().getBbox().get(3));
|
||||
tableCells.setWidth(tableCells.getX1()- tableCells.getX0());
|
||||
tableCells.setHeight(tableCells.getY1()- tableCells.getY0());
|
||||
parsedTableCells.add(tableCells);
|
||||
t.getObjects().forEach(o -> {
|
||||
System.out.println(o.getLabel());
|
||||
TableCells objectCell = new TableCells();
|
||||
objectCell.setX0(t.getTable().getBbox().get(0));
|
||||
objectCell.setX1(t.getTable().getBbox().get(2));
|
||||
objectCell.setY0(t.getTable().getBbox().get(1));
|
||||
objectCell.setY1(t.getTable().getBbox().get(3));
|
||||
objectCell.setWidth(objectCell.getX1()- objectCell.getX0());
|
||||
objectCell.setHeight(objectCell.getY1()- objectCell.getY0());
|
||||
parsedTableCells.add(objectCell);
|
||||
});
|
||||
});
|
||||
|
||||
return parsedTableCells;
|
||||
|
||||
}
|
||||
|
||||
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
|
||||
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse());
|
||||
new TableServiceResponse(), new TableExtractorResponse());
|
||||
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
|
||||
|
||||
@ -33,6 +33,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePag
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||
@ -67,7 +68,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
tableServiceResponse);
|
||||
tableServiceResponse, new TableExtractorResponse());
|
||||
|
||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||
|
||||
|
||||
@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
@ -79,11 +80,11 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(filename.toFile()),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
new TableServiceResponse(), new TableExtractorResponse()));
|
||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||
Loader.loadPDF(filename.toFile()),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse()));
|
||||
new TableServiceResponse(), new TableExtractorResponse()));
|
||||
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
||||
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
|
||||
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {
|
||||
|
||||
@ -26,6 +26,8 @@ import org.springframework.test.context.junit.jupiter.SpringExtension;
|
||||
import java.io.InputStream;
|
||||
import java.util.Optional;
|
||||
|
||||
import javax.swing.text.html.Option;
|
||||
|
||||
@ExtendWith(SpringExtension.class)
|
||||
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
|
||||
@Import(AbstractTest.TestConfiguration.class)
|
||||
@ -46,6 +48,8 @@ public abstract class AbstractTest {
|
||||
protected final static String ORIGIN_FILE_ID = "origin";
|
||||
protected final static String TABLE_FILE_ID = "table";
|
||||
protected final static String IMAGE_FILE_ID = "image";
|
||||
|
||||
protected final static String TABLE_EXTRACTOR_FILE_ID = "extractedTable";
|
||||
protected final static String STRUCTURE_FILE_ID = "structure";
|
||||
protected final static String TEXT_FILE_ID = "texts";
|
||||
protected final static String POSITION_FILE_ID = "positions";
|
||||
@ -62,6 +66,7 @@ public abstract class AbstractTest {
|
||||
.originFileStorageId(ORIGIN_FILE_ID)
|
||||
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
||||
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
|
||||
.tableExtractorFileId(Optional.of(TABLE_EXTRACTOR_FILE_ID))
|
||||
.structureFileStorageId(STRUCTURE_FILE_ID)
|
||||
.textBlockFileStorageId(TEXT_FILE_ID)
|
||||
.positionBlockFileStorageId(POSITION_FILE_ID)
|
||||
@ -89,7 +94,7 @@ public abstract class AbstractTest {
|
||||
@SneakyThrows
|
||||
protected LayoutParsingRequest prepareStorage(String file) {
|
||||
|
||||
return prepareStorage(file, "cv_table_parsing_response/empty.json", "image_service_response/empty.json");
|
||||
return prepareStorage(file, "cv_table_parsing_response/empty.json", "image_service_response/empty.json","table_extractor_response/empty.json");
|
||||
}
|
||||
|
||||
|
||||
@ -107,6 +112,7 @@ public abstract class AbstractTest {
|
||||
.originFileStorageId(ORIGIN_FILE_ID)
|
||||
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
||||
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
|
||||
.tableExtractorFileId(Optional.of(TABLE_EXTRACTOR_FILE_ID))
|
||||
.structureFileStorageId(STRUCTURE_FILE_ID)
|
||||
.textBlockFileStorageId(TEXT_FILE_ID)
|
||||
.positionBlockFileStorageId(POSITION_FILE_ID)
|
||||
@ -117,21 +123,23 @@ public abstract class AbstractTest {
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile) {
|
||||
protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile, String tableExtractorResponseFile) {
|
||||
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(file);
|
||||
ClassPathResource cvServiceResponseFileResource = new ClassPathResource(cvServiceResponseFile);
|
||||
ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile);
|
||||
ClassPathResource tableExtractorResponseFileRessource = new ClassPathResource(tableExtractorResponseFile);
|
||||
|
||||
return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream());
|
||||
return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream(),tableExtractorResponseFileRessource.getInputStream());
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream) {
|
||||
protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream, InputStream tableExtractorResponseFileStream) {
|
||||
|
||||
storageService.storeObject(TenantContext.getTenantId(), IMAGE_FILE_ID, imageInfoStream);
|
||||
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
|
||||
storageService.storeObject(TenantContext.getTenantId(), TABLE_EXTRACTOR_FILE_ID, tableExtractorResponseFileStream);
|
||||
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
|
||||
|
||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
||||
|
||||
@ -11,6 +11,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
|
||||
@ -28,7 +29,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
||||
ClassPathResource fileResource = new ClassPathResource(filename);
|
||||
prepareStorage(filename);
|
||||
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream.readAllBytes())) {
|
||||
return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse());
|
||||
return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse(), new TableExtractorResponse());
|
||||
}
|
||||
}
|
||||
|
||||
@ -44,7 +45,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
||||
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
|
||||
|
||||
if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
|
||||
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json");
|
||||
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json","table_extractor_response/empty.json");
|
||||
} else {
|
||||
prepareStorage(filename);
|
||||
}
|
||||
|
||||
@ -0,0 +1,8 @@
|
||||
{
|
||||
"dossierId": "123",
|
||||
"fileId": "123",
|
||||
"operation": "table",
|
||||
"targetFileExtension": "ORIGIN.pdf.gz",
|
||||
"responseFileExtension": "TABLES.json.gz",
|
||||
"extractedTableData": []
|
||||
}
|
||||
@ -4,5 +4,5 @@ gradle assemble
|
||||
|
||||
buildNumber=${1:-1}
|
||||
|
||||
gradle bootBuildImage --cleanCache --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$USER-$buildNumber
|
||||
echo "nexus.knecon.com:5001/red/${dir}-server-v1:$USER-$buildNumber"
|
||||
gradle bootBuildImage --cleanCache --publishImage -Pversion=layout-parser-yannik-$buildNumber --stacktrace
|
||||
# The printed tag must match the -Pversion value passed to bootBuildImage.
echo "nexus.knecon.com:5001/red/${dir}-server-v1:layout-parser-yannik-$buildNumber"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user