Compare commits

...

13 Commits

Author SHA1 Message Date
yhampe
aa1e77dda3 RED-7375 table extractor prototype 2024-02-01 08:23:53 +01:00
yhampe
0c3b910088 RED-7375 table extractor prototype 2024-01-30 11:59:18 +01:00
yhampe
b4e5f2da2f RED-7375 table extractor prototype
added label
changed colour of bounding boxes
2024-01-17 16:58:53 +01:00
yhampe
b47b187c8a RED-7375: table extractor prototype
added layer for table extractor results in ViewerDocumentService
2024-01-15 09:43:23 +01:00
yhampe
0d4800622d RED-7375:Integrate Table Parsing Service
* integrated table parsing service
* for visualization, simply drawing the extracted table in ViewerDocumentService
2023-12-01 13:26:16 +01:00
yhampe
cc492bc50d RED-7375:Integrate Table Parsing Service
* fixed merging mistake
2023-11-29 10:06:05 +01:00
yhampe
1c3c5385e1 Merge remote-tracking branch 'origin/RED-7375' into RED-7375 2023-11-29 09:40:24 +01:00
yhampe
bce2558133 RED-7375:Integrate Table Parsing Service
* fixed end2end test by introducing sample table extractor response
2023-11-29 09:39:58 +01:00
yhampe
2200574b6d RED-7375:Integrate Table Parsing Service
* disabled not needed test
2023-11-29 09:39:58 +01:00
yhampe
775c943f7e RED-7375:
* using ViewerDocumentService to draw TableExtractorResponse into documents
2023-11-29 09:39:53 +01:00
yhampe
b7fe6fd3c4 RED-7375:Integrate Table Parsing Service
* fixed end2end test by introducing sample table extractor response
2023-11-29 09:37:47 +01:00
yhampe
3dd447ebef RED-7375:Integrate Table Parsing Service
* disabled not needed test
2023-11-29 08:41:31 +01:00
yhampe
f4e93ef03b RED-7375:
* using ViewerDocumentService to draw TableExtractorResponse into documents
2023-11-28 15:14:04 +01:00
58 changed files with 516 additions and 210 deletions

View File

@ -20,6 +20,9 @@ public record LayoutParsingRequest(
@NonNull String originFileStorageId,// @NonNull String originFileStorageId,//
@Schema(description = "Optional Path to the table extraction file.")// @Schema(description = "Optional Path to the table extraction file.")//
Optional<String> tablesFileStorageId,// Optional<String> tablesFileStorageId,//
@Schema(description= "Optional Path to the the table parsing service file")
Optional<String> tableExtractorFileId,
@Schema(description = "Optional Path to the image classification file.")// @Schema(description = "Optional Path to the image classification file.")//
Optional<String> imagesFileStorageId,// Optional<String> imagesFileStorageId,//

View File

@ -30,8 +30,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageB
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.TableExtractorResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
@ -63,6 +66,7 @@ public class LayoutParsingPipeline {
private final ImageServiceResponseAdapter imageServiceResponseAdapter; private final ImageServiceResponseAdapter imageServiceResponseAdapter;
private final CvTableParsingAdapter cvTableParsingAdapter; private final CvTableParsingAdapter cvTableParsingAdapter;
private final LayoutParsingStorageService layoutParsingStorageService; private final LayoutParsingStorageService layoutParsingStorageService;
private final TableExtractorResponseAdapter tableExtractorResponseAdapter;
private final SectionsBuilderService sectionsBuilderService; private final SectionsBuilderService sectionsBuilderService;
private final TaasClassificationService taasClassificationService; private final TaasClassificationService taasClassificationService;
private final RedactManagerClassificationService redactManagerClassificationService; private final RedactManagerClassificationService redactManagerClassificationService;
@ -87,12 +91,21 @@ public class LayoutParsingPipeline {
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()); imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
} }
TableExtractorResponse tableExtractorResponse = new TableExtractorResponse();
if (layoutParsingRequest.tableExtractorFileId().isPresent()) {
tableExtractorResponse = layoutParsingStorageService.getExtractedTableFile(layoutParsingRequest.tableExtractorFileId().get());
}
TableServiceResponse tableServiceResponse = new TableServiceResponse(); TableServiceResponse tableServiceResponse = new TableServiceResponse();
if (layoutParsingRequest.tablesFileStorageId().isPresent()) { if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get()); tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
} }
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(), originDocument, imageServiceResponse, tableServiceResponse); ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
originDocument,
imageServiceResponse,
tableServiceResponse,
tableExtractorResponse);
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument); Document documentGraph = DocumentGraphFactory.buildDocumentGraph(classificationDocument);
int numberOfPages = originDocument.getNumberOfPages(); int numberOfPages = originDocument.getNumberOfPages();
@ -100,8 +113,9 @@ public class LayoutParsingPipeline {
layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph)); layoutParsingStorageService.storeDocumentData(layoutParsingRequest, DocumentDataMapper.toDocumentData(documentGraph));
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph)); layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentGraph));
Map<Integer, List<TableExtractorCells>> extractedTableCells = tableExtractorResponseAdapter.buildExtractedTablesPerPage(tableExtractorResponse);
try (var out = new ByteArrayOutputStream()) { try (var out = new ByteArrayOutputStream()) {
viewerDocumentService.createViewerDocument(originDocument, documentGraph, out, false); viewerDocumentService.createViewerDocument(originDocument, documentGraph, out,extractedTableCells ,false);
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out); layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, out);
} }
@ -157,11 +171,12 @@ public class LayoutParsingPipeline {
public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType, public ClassificationDocument parseLayout(LayoutParsingType layoutParsingType,
PDDocument originDocument, PDDocument originDocument,
ImageServiceResponse imageServiceResponse, ImageServiceResponse imageServiceResponse,
TableServiceResponse tableServiceResponse) { TableServiceResponse tableServiceResponse,
TableExtractorResponse tableExtractorResponse) {
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse); Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse); Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
Map<Integer, List<TableExtractorCells>> extractedTableCells = tableExtractorResponseAdapter.buildExtractedTablesPerPage(tableExtractorResponse);
ClassificationDocument classificationDocument = new ClassificationDocument(); ClassificationDocument classificationDocument = new ClassificationDocument();
List<ClassificationPage> classificationPages = new ArrayList<>(); List<ClassificationPage> classificationPages = new ArrayList<>();
@ -244,9 +259,9 @@ public class LayoutParsingPipeline {
private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) { private void increaseDocumentStatistics(ClassificationPage classificationPage, ClassificationDocument document) {
if (!classificationPage.isLandscape()) { if (!classificationPage.isLandscape()) {
document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue()); document.getFontSizeCounter().addAll(classificationPage.getFontSizeCounter().getCountPerValue());
} }
document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue()); document.getFontCounter().addAll(classificationPage.getFontCounter().getCountPerValue());
document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue()); document.getTextHeightCounter().addAll(classificationPage.getTextHeightCounter().getCountPerValue());
document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue()); document.getFontStyleCounter().addAll(classificationPage.getFontStyleCounter().getCountPerValue());

View File

@ -23,6 +23,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Si
import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.taas.ResearchDocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.tenantcommons.TenantContext; import com.knecon.fforesight.tenantcommons.TenantContext;
@ -63,6 +64,16 @@ public class LayoutParsingStorageService {
} }
/**
 * Loads the table extractor result stored under the given id and deserializes it.
 *
 * @param storageId id of the stored object, passed to {@code getObject}
 * @return the content of the stored object mapped to a {@link TableExtractorResponse}
 * @throws IOException if the object cannot be read or deserialized
 */
public TableExtractorResponse getExtractedTableFile(String storageId) throws IOException {

    // try-with-resources closes the stream on every exit path, so no explicit close() is needed.
    try (InputStream inputStream = getObject(storageId)) {
        return objectMapper.readValue(inputStream, TableExtractorResponse.class);
    }
}
public TableServiceResponse getTablesFile(String storageId) throws IOException { public TableServiceResponse getTablesFile(String storageId) throws IOException {
try (var tableClassificationStream = getObject(storageId)) { try (var tableClassificationStream = getObject(storageId)) {
@ -83,7 +94,6 @@ public class LayoutParsingStorageService {
} }
public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) { public void storeResearchDocumentData(LayoutParsingRequest layoutParsingRequest, ResearchDocumentData researchDocumentData) {
storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData); storageService.storeJSONObject(TenantContext.getTenantId(), layoutParsingRequest.researchDocumentStorageId(), researchDocumentData);

View File

@ -14,7 +14,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFre
import lombok.Data; import lombok.Data;
import lombok.NonNull; import lombok.NonNull;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
@Data @Data
@RequiredArgsConstructor @RequiredArgsConstructor

View File

@ -19,4 +19,5 @@ public class PageContents {
Rectangle2D cropBox; Rectangle2D cropBox;
Rectangle2D mediaBox; Rectangle2D mediaBox;
List<Ruling> rulings; List<Ruling> rulings;
} }

View File

@ -108,11 +108,13 @@ public class Boundary implements Comparable<Boundary> {
return splitBoundaries; return splitBoundaries;
} }
public IntStream intStream() { public IntStream intStream() {
return IntStream.range(start, end); return IntStream.range(start, end);
} }
public static Boundary merge(Collection<Boundary> boundaries) { public static Boundary merge(Collection<Boundary> boundaries) {
int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new); int minStart = boundaries.stream().mapToInt(Boundary::start).min().orElseThrow(IllegalArgumentException::new);

View File

@ -105,6 +105,7 @@ public class Document implements GenericSemanticNode {
return streamAllSubNodes().collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting())); return streamAllSubNodes().collect(Collectors.groupingBy(SemanticNode::getType, Collectors.counting()));
} }
@Override @Override
public String toString() { public String toString() {

View File

@ -207,6 +207,7 @@ public class Table implements SemanticNode {
return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col)); return IntStream.range(0, numberOfCols).boxed().map(col -> getCell(row, col));
} }
/** /**
* Streams all TableCells row-wise and filters them with header == true. * Streams all TableCells row-wise and filters them with header == true.
* *

View File

@ -109,10 +109,7 @@ public class AtomicTextBlock implements TextBlock {
} }
public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, public static AtomicTextBlock fromAtomicTextBlockData(DocumentTextData documentTextData, DocumentPositionData documentPositionData, SemanticNode parent, Page page) {
DocumentPositionData documentPositionData,
SemanticNode parent,
Page page) {
return AtomicTextBlock.builder() return AtomicTextBlock.builder()
.id(documentTextData.getId()) .id(documentTextData.getId())

View File

@ -1,14 +1,12 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table; package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Point2D; import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
@ -50,6 +48,7 @@ public class TablePageBlock extends AbstractPageBlock {
return getColCount() == 0 || getRowCount() == 0; return getColCount() == 0 || getRowCount() == 0;
} }
public List<List<Cell>> getRows() { public List<List<Cell>> getRows() {
if (rows == null) { if (rows == null) {
@ -276,21 +275,17 @@ public class TablePageBlock extends AbstractPageBlock {
} }
public boolean intersects(Cell cell1, Cell cell2) { public boolean intersects(Cell cell1, Cell cell2) {
if (cell1.getHeight() <= 0 || cell2.getHeight() <= 0) { if (cell1.getHeight() <= 0 || cell2.getHeight() <= 0) {
return false; return false;
} }
double x0 = cell1.getX() + 2; double x0 = cell1.getX() + 2;
double y0 = cell1.getY() + 2; double y0 = cell1.getY() + 2;
return (cell2.x + cell2.width > x0 && return (cell2.x + cell2.width > x0 && cell2.y + cell2.height > y0 && cell2.x < x0 + cell1.getWidth() - 2 && cell2.y < y0 + cell1.getHeight() - 2);
cell2.y + cell2.height > y0 &&
cell2.x < x0 + cell1.getWidth() -2 &&
cell2.y < y0 + cell1.getHeight() -2);
} }
@Override @Override
public String getText() { public String getText() {
@ -328,8 +323,6 @@ public class TablePageBlock extends AbstractPageBlock {
} }
public String getTextAsHtml() { public String getTextAsHtml() {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();

View File

@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities; import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.Getter; import lombok.Getter;

View File

@ -73,7 +73,7 @@ public class TextPageBlock extends AbstractPageBlock {
return sequences.get(0).getPageWidth(); return sequences.get(0).getPageWidth();
} }
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) { public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
@ -82,6 +82,7 @@ public class TextPageBlock extends AbstractPageBlock {
return fromTextPositionSequences(sequences); return fromTextPositionSequences(sequences);
} }
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) { public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
TextPageBlock textBlock = null; TextPageBlock textBlock = null;
@ -133,7 +134,6 @@ public class TextPageBlock extends AbstractPageBlock {
} }
/** /**
* Returns the minX value in pdf coordinate system. * Returns the minX value in pdf coordinate system.
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation. * Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.

View File

@ -234,6 +234,7 @@ public class TextPositionSequence implements CharSequence {
@JsonIgnore @JsonIgnore
@JsonAttribute(ignore = true) @JsonAttribute(ignore = true)
public String getFontStyle() { public String getFontStyle() {
if (textPositions.get(0).getFontName() == null) { if (textPositions.get(0).getFontName() == null) {
return "standard"; return "standard";
} }

View File

@ -9,10 +9,10 @@ import java.util.Map;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -20,8 +20,7 @@ import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor @RequiredArgsConstructor
public class ImageServiceResponseAdapter { public class ImageServiceResponseAdapter {
public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse) {
public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse ) {
Map<Integer, List<ClassifiedImage>> images = new HashMap<>(); Map<Integer, List<ClassifiedImage>> images = new HashMap<>();
imageServiceResponse.getData().forEach(imageMetadata -> { imageServiceResponse.getData().forEach(imageMetadata -> {

View File

@ -0,0 +1,74 @@
package com.knecon.fforesight.service.layoutparser.processor.python_api.adapter;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTable;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTableData;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableData;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
 * Converts a {@link TableExtractorResponse} into flat lists of {@link TableExtractorCells}
 * (bounding box plus label), grouped by page number.
 */
@Service
@Slf4j
@RequiredArgsConstructor
public class TableExtractorResponseAdapter {

    // Positions of the corner coordinates inside the bbox list of an ExtractedTableData.
    private static final int BBOX_X0 = 0;
    private static final int BBOX_Y0 = 1;
    private static final int BBOX_X1 = 2;
    private static final int BBOX_Y1 = 3;


    /**
     * Groups all converted table and object cells of the response by their page number.
     *
     * @param tableExtractorResponse the response to convert; may be null or carry no data
     *                               (e.g. a freshly constructed, empty response)
     * @return a mutable map from page number to the cells found on that page; empty if there is no data
     */
    public Map<Integer, List<TableExtractorCells>> buildExtractedTablesPerPage(TableExtractorResponse tableExtractorResponse) {

        Map<Integer, List<TableExtractorCells>> tableCells = new HashMap<>();
        // A response created via the no-args constructor has no data list; treat it as "no tables".
        if (tableExtractorResponse == null || tableExtractorResponse.getData() == null) {
            return tableCells;
        }
        tableExtractorResponse.getData()
                .forEach(tableData -> tableCells.computeIfAbsent(tableData.getPage_number(), pageNumber -> new ArrayList<>())
                        .addAll(convertTableCells(tableData.getTables())));
        return tableCells;
    }


    /**
     * Converts each extracted table into one cell for the table itself, followed by one cell
     * for every object listed for that table.
     *
     * @param tableObjects the extracted tables of a single page; null is treated as empty
     * @return a mutable list with the table cell and object cells of every table, in input order
     */
    public List<TableExtractorCells> convertTableCells(List<ExtractedTable> tableObjects) {

        List<TableExtractorCells> parsedTableCells = new ArrayList<>();
        if (tableObjects == null) {
            return parsedTableCells;
        }
        for (ExtractedTable extractedTable : tableObjects) {
            TableExtractorCells tableCell = toCell(extractedTable.getTable());
            log.info("Parsed table cell {} with label {}", tableCell, tableCell.getLabel());
            parsedTableCells.add(tableCell);

            if (extractedTable.getObjects() == null) {
                continue;
            }
            for (ExtractedTableData object : extractedTable.getObjects()) {
                // Each object carries its own bbox; it must not inherit the bbox of the enclosing table.
                TableExtractorCells objectCell = toCell(object);
                log.info("Parsed object cell {} with label {}", objectCell, objectCell.getLabel());
                parsedTableCells.add(objectCell);
            }
        }
        return parsedTableCells;
    }


    /**
     * Builds a cell from the bbox and label of the given data; width and height are derived from the corners.
     */
    private static TableExtractorCells toCell(ExtractedTableData data) {

        List<Float> bbox = data.getBbox();
        TableExtractorCells cell = new TableExtractorCells();
        cell.setX0(bbox.get(BBOX_X0));
        cell.setY0(bbox.get(BBOX_Y0));
        cell.setX1(bbox.get(BBOX_X1));
        cell.setY1(bbox.get(BBOX_Y1));
        cell.setWidth(cell.getX1() - cell.getX0());
        cell.setHeight(cell.getY1() - cell.getY0());
        cell.setLabel(data.getLabel());
        return cell;
    }

}

View File

@ -0,0 +1,20 @@
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * One table entry of the table extractor response: the table itself plus the objects listed for it.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class ExtractedTable {
// NOTE(review): presumably marks a table detected in rotated orientation - confirm against the service contract.
private boolean rotated;
// Label and bounding box of the table itself.
private ExtractedTableData table;
// Labelled bounding boxes belonging to this table; TableExtractorResponseAdapter emits one cell per entry.
private List<ExtractedTableData> objects;
}

View File

@ -0,0 +1,20 @@
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * A labelled bounding box, used both for a table and for the objects listed under a table.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class ExtractedTableData {
// Label of this box; copied into TableExtractorCells by TableExtractorResponseAdapter.
private String label;
// NOTE(review): presumably the detection confidence - value range not visible here, confirm.
private float score;
// Corner coordinates; TableExtractorResponseAdapter reads index 0 as x0, 1 as y0, 2 as x1 and 3 as y1.
private List<Float> bbox;
}

View File

@ -0,0 +1,21 @@
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * A labelled rectangle produced by TableExtractorResponseAdapter from a table extractor bounding box.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class TableExtractorCells {
// First corner of the rectangle (bbox indices 0 and 1).
private float x0;
private float y0;
// Second corner of the rectangle (bbox indices 2 and 3).
private float x1;
private float y1;
// Set by the adapter to x1 - x0.
private float width;
// Set by the adapter to y1 - y0.
private float height;
// Label taken from the source bounding box.
private String label;
}

View File

@ -0,0 +1,19 @@
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * The tables reported by the table extractor for one page.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class TableExtractorData {
// Key used by TableExtractorResponseAdapter to group cells per page.
// NOTE(review): snake_case presumably mirrors the JSON property name - confirm before renaming.
private int page_number;
// NOTE(review): meaning not visible here (possibly an image index on the page) - confirm.
private int image;
// Tables found on this page.
private List<ExtractedTable> tables;
}

View File

@ -0,0 +1,23 @@
package com.knecon.fforesight.service.layoutparser.processor.python_api.model.table;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
/**
 * Top-level response of the table extractor; deserialized from storage via
 * LayoutParsingStorageService.getExtractedTableFile.
 */
@Data
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class TableExtractorResponse {
private String dossierId;
private String fileId;
private String targetFileExtension;
private String responseFileExtension;
// NOTE(review): the unusual name presumably mirrors the JSON property / header name - confirm before renaming.
private String X_TENANT_ID;
// Per-page results. No default is assigned, so this is null on an instance created via the no-args constructor.
private List<TableExtractorData> data;
}

View File

@ -25,6 +25,7 @@ public class BodyTextFrameService {
private static final float RULING_HEIGHT_THRESHOLD = 0.15f; // multiplied with page height. Header/Footer Rulings must be within that border of the page. private static final float RULING_HEIGHT_THRESHOLD = 0.15f; // multiplied with page height. Header/Footer Rulings must be within that border of the page.
private static final float RULING_WIDTH_THRESHOLD = 0.75f; // multiplied with page width. Header/Footer Rulings must be at least that wide. private static final float RULING_WIDTH_THRESHOLD = 0.75f; // multiplied with page width. Header/Footer Rulings must be at least that wide.
public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) { public void setBodyTextFrames(ClassificationDocument classificationDocument, LayoutParsingType layoutParsingType) {
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType); Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
@ -155,8 +156,9 @@ public class BodyTextFrameService {
continue; continue;
} }
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || MarkedContentUtils.intersects(textBlock,
|| MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)) { page.getMarkedContentBboxPerType(),
MarkedContentUtils.FOOTER)) {
continue; continue;
} }

View File

@ -22,7 +22,6 @@ public class DividingColumnDetectionService {
public List<Rectangle2D> detectColumns(PageContents pageContents) { public List<Rectangle2D> detectColumns(PageContents pageContents) {
if (pageContents.getSortedTextPositionSequences().size() < 2) { if (pageContents.getSortedTextPositionSequences().size() < 2) {
return List.of(pageContents.getCropBox()); return List.of(pageContents.getCropBox());
} }

View File

@ -72,11 +72,13 @@ public class GapDetectionService {
return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle())); return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle()));
} }
private static Rectangle2D mirrorY(Rectangle2D rectangle2D) { private static Rectangle2D mirrorY(Rectangle2D rectangle2D) {
return new Rectangle2D.Double(rectangle2D.getX(), Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()), rectangle2D.getWidth(), Math.abs(rectangle2D.getHeight())); return new Rectangle2D.Double(rectangle2D.getX(), Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()), rectangle2D.getWidth(), Math.abs(rectangle2D.getHeight()));
} }
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) { private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(), context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(),

View File

@ -6,7 +6,6 @@ import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Queue; import java.util.Queue;
import java.util.stream.Stream; import java.util.stream.Stream;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
@ -51,7 +50,9 @@ public class GapsAcrossLinesService {
} }
return columnFactory.outputGaps.stream() return columnFactory.outputGaps.stream()
.filter(gapAcrossLines -> columnFactory.outputGaps.stream().filter(gapAcrossLines::intersectsX).noneMatch(gapAcrossLines1 -> gapAcrossLines1.lineCount > gapAcrossLines.lineCount)) .filter(gapAcrossLines -> columnFactory.outputGaps.stream()
.filter(gapAcrossLines::intersectsX)
.noneMatch(gapAcrossLines1 -> gapAcrossLines1.lineCount > gapAcrossLines.lineCount))
.filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMinX() - mainBodyTextFrame.getMinX()) > DISTANCE_TO_BORDER_THRESHOLD) .filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMinX() - mainBodyTextFrame.getMinX()) > DISTANCE_TO_BORDER_THRESHOLD)
.filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMaxX() - mainBodyTextFrame.getMaxX()) > DISTANCE_TO_BORDER_THRESHOLD) .filter(gapAcrossLines -> Math.abs(gapAcrossLines.rectangle2D.getMaxX() - mainBodyTextFrame.getMaxX()) > DISTANCE_TO_BORDER_THRESHOLD)
.map(GapAcrossLines::getRectangle2D) .map(GapAcrossLines::getRectangle2D)

View File

@ -6,8 +6,8 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation; import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation; import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Getter; import lombok.Getter;

View File

@ -16,8 +16,7 @@ public class MainBodyTextFrameExtractionService {
public Rectangle2D calculateMainBodyTextFrame(LineInformation lineInformation) { public Rectangle2D calculateMainBodyTextFrame(LineInformation lineInformation) {
Rectangle2D mainBodyTextFrame = lineInformation.getLineBBox().stream() Rectangle2D mainBodyTextFrame = lineInformation.getLineBBox().stream().collect(RectangleTransformations.collectBBox());
.collect(RectangleTransformations.collectBBox());
return RectangleTransformations.pad(mainBodyTextFrame, mainBodyTextFrame.getWidth() * TEXT_FRAME_PAD_WIDTH, mainBodyTextFrame.getHeight() * TEXT_FRAME_PAD_HEIGHT); return RectangleTransformations.pad(mainBodyTextFrame, mainBodyTextFrame.getWidth() * TEXT_FRAME_PAD_WIDTH, mainBodyTextFrame.getHeight() * TEXT_FRAME_PAD_HEIGHT);
} }

View File

@ -52,7 +52,7 @@ public class PageContentExtractor {
stripper.getRulings())); stripper.getRulings()));
} }
} }
return textPositionSequencesPerPage; return textPositionSequencesPerPage;
} }

View File

@ -5,9 +5,9 @@ import java.util.List;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedSectionText;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.SimplifiedText;
@Service @Service
public class SimplifiedSectionTextService { public class SimplifiedSectionTextService {
@ -23,4 +23,5 @@ public class SimplifiedSectionTextService {
return SimplifiedSectionText.builder().sectionNumber(section.getTreeId().get(0)).text(section.getTextBlock().getSearchText()).build(); return SimplifiedSectionText.builder().sectionNumber(section.getTreeId().get(0)).text(section.getTextBlock().getSearchText()).build();
} }
} }

View File

@ -1,9 +1,20 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification; package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
// TODO: figure out, why this fails the build // TODO: figure out, why this fails the build
// import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING; // import static com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory.HEIGHT_PADDING;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation; import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
@ -11,12 +22,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil; import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
import org.springframework.stereotype.Service;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Stream;
@Service @Service
@SuppressWarnings("all") @SuppressWarnings("all")
@ -83,13 +88,13 @@ public class TaasBlockificationService {
continue; continue;
} }
Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText()); Matcher listIdentifierPattern = listIdentifier.matcher(currentTextBlock.getText());
boolean isListIdentifier = listIdentifierPattern.find(); boolean isListIdentifier = listIdentifierPattern.find();
boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER; boolean yGap = Math.abs(currentTextBlock.getPdfMaxY() - previousTextBlock.getPdfMinY()) < previousTextBlock.getMostPopularWordHeight() * Y_GAP_SPLIT_HEIGHT_MODIFIER;
boolean sameFont = previousTextBlock.getMostPopularWordFont().equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize(); boolean sameFont = previousTextBlock.getMostPopularWordFont()
.equals(currentTextBlock.getMostPopularWordFont()) && previousTextBlock.getMostPopularWordFontSize() == currentTextBlock.getMostPopularWordFontSize();
// boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER; // boolean yGap = previousTextBlock != null && currentTextBlock.getMinYDirAdj() - maxY > Math.min(word.getHeight(), prev.getHeight()) * Y_GAP_SPLIT_HEIGHT_MODIFIER;
boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD; boolean alignsXRight = Math.abs(currentTextBlock.getPdfMaxX() - previousTextBlock.getPdfMaxX()) < X_ALIGNMENT_THRESHOLD;
@ -119,8 +124,9 @@ public class TaasBlockificationService {
} }
alreadyMerged.add(textPageBlock); alreadyMerged.add(textPageBlock);
textBlocksToMerge.add(Stream.concat(Stream.of(textPageBlock), textBlocksToMerge.add(Stream.concat(Stream.of(textPageBlock),
textPageBlocks.stream().filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2)).peek(alreadyMerged::add)) textPageBlocks.stream()
.toList()); .filter(textPageBlock2 -> textPageBlock.almostIntersects(textPageBlock2, INTERSECTS_Y_THRESHOLD, 0) && !alreadyMerged.contains(textPageBlock2))
.peek(alreadyMerged::add)).toList());
} }
return textBlocksToMerge.stream().map(TextPageBlock::merge).toList(); return textBlocksToMerge.stream().map(TextPageBlock::merge).toList();
} }
@ -163,8 +169,7 @@ public class TaasBlockificationService {
while (itty.hasNext()) { while (itty.hasNext()) {
TextPageBlock block = (TextPageBlock) itty.next(); TextPageBlock block = (TextPageBlock) itty.next();
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold( if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
block.getMaxY(),
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation() previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) { .equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
previous.add(block); previous.add(block);
@ -189,7 +194,6 @@ public class TaasBlockificationService {
TextPositionSequence prev = null; TextPositionSequence prev = null;
// TODO: make static final constant // TODO: make static final constant
boolean wasSplitted = false; boolean wasSplitted = false;
Float splitX1 = null; Float splitX1 = null;
for (TextPositionSequence word : textPositions) { for (TextPositionSequence word : textPositions) {

View File

@ -5,7 +5,6 @@ import java.util.Locale;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
@ -13,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -63,16 +63,16 @@ public class DocuMineClassificationService {
textBlock.setClassification(PageBlockType.OTHER); textBlock.setClassification(PageBlockType.OTHER);
return; return;
} }
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() textBlock,
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
) { .getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER); textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() textBlock,
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular()) page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
) { .getMostPopular())) {
textBlock.setClassification(PageBlockType.FOOTER); textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()

View File

@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
import java.util.List; import java.util.List;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
@ -11,6 +10,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -21,7 +21,6 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor @RequiredArgsConstructor
public class RedactManagerClassificationService { public class RedactManagerClassificationService {
public void classifyDocument(ClassificationDocument document) { public void classifyDocument(ClassificationDocument document) {
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
@ -52,14 +51,16 @@ public class RedactManagerClassificationService {
textBlock.setClassification(PageBlockType.OTHER); textBlock.setClassification(PageBlockType.OTHER);
return; return;
} }
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() textBlock,
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.HEADER); textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter() textBlock,
.getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter().getMostPopular())) { page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
.getMostPopular())) {
textBlock.setClassification(PageBlockType.FOOTER); textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()

View File

@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.services.classifica
import java.util.List; import java.util.List;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
@ -12,6 +11,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType; import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService; import com.knecon.fforesight.service.layoutparser.processor.services.BodyTextFrameService;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils; import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@ -27,7 +27,6 @@ public class TaasClassificationService {
public void classifyDocument(ClassificationDocument document) { public void classifyDocument(ClassificationDocument document) {
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular(); List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue()); log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
@ -57,11 +56,13 @@ public class TaasClassificationService {
textBlock.setClassification(PageBlockType.OTHER); textBlock.setClassification(PageBlockType.OTHER);
return; return;
} }
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { textBlock,
page.getRotation())) {
textBlock.setClassification(PageBlockType.HEADER); textBlock.setClassification(PageBlockType.HEADER);
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) } else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation())) { textBlock,
page.getRotation())) {
textBlock.setClassification(PageBlockType.FOOTER); textBlock.setClassification(PageBlockType.FOOTER);
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, } else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks() document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()

View File

@ -18,8 +18,6 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;
@ -31,6 +29,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Pa
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder; import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;

View File

@ -8,10 +8,10 @@ import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Objects; import java.util.Objects;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition; import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.Boundary;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;
@ -110,6 +110,7 @@ public class SearchTextWithTextPositionFactory {
return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE; return context.stringIdx - context.lastHyphenIdx < MAX_HYPHEN_LINEBREAK_DISTANCE;
} }
private static List<Boundary> mergeToBoundaries(List<Integer> integers) { private static List<Boundary> mergeToBoundaries(List<Integer> integers) {
if (integers.isEmpty()) { if (integers.isEmpty()) {
@ -125,8 +126,9 @@ public class SearchTextWithTextPositionFactory {
} }
end = current + 1; end = current + 1;
} }
if (boundaries.isEmpty()) if (boundaries.isEmpty()) {
boundaries.add(new Boundary(start, end)); boundaries.add(new Boundary(start, end));
}
return boundaries; return boundaries;
} }
@ -138,6 +140,7 @@ public class SearchTextWithTextPositionFactory {
} }
} }
private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) { private boolean isLineBreak(RedTextPosition currentTextPosition, RedTextPosition previousTextPosition) {
return Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition); return Objects.equals(currentTextPosition.getUnicode(), "\n") || isDeltaYLargerThanTextHeight(currentTextPosition, previousTextPosition);
@ -163,17 +166,7 @@ public class SearchTextWithTextPositionFactory {
private boolean isHyphen(String unicodeCharacter) { private boolean isHyphen(String unicodeCharacter) {
return Objects.equals(unicodeCharacter, "-") || // return false;
Objects.equals(unicodeCharacter, "~") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "") || //
Objects.equals(unicodeCharacter, "\u00AD");
} }

View File

@ -11,12 +11,12 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility; import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;

View File

@ -8,15 +8,15 @@ import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations; import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import lombok.experimental.UtilityClass; import lombok.experimental.UtilityClass;

View File

@ -2,10 +2,10 @@ package com.knecon.fforesight.service.layoutparser.processor.services.factory;
import java.util.List; import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.AccessLevel; import lombok.AccessLevel;
import lombok.experimental.FieldDefaults; import lombok.experimental.FieldDefaults;

View File

@ -7,11 +7,11 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.NoSuchElementException; import java.util.NoSuchElementException;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPage;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentPositionData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentTextData;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree; import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Footer;

View File

@ -1,7 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.services.mapper; package com.knecon.fforesight.service.layoutparser.processor.services.mapper;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.Locale; import java.util.Locale;
import java.util.Map; import java.util.Map;
@ -9,7 +8,6 @@ import java.util.Map;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Image;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;

View File

@ -329,6 +329,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
.getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize; .getXDirAdj() - (previous.getXDirAdj() + previous.getWidthDirAdj()) < maximumGapSize;
} }
@Override @Override
public String getText(PDDocument doc) throws IOException { public String getText(PDDocument doc) throws IOException {

View File

@ -25,10 +25,23 @@ import java.io.StringWriter;
import java.io.Writer; import java.io.Writer;
import java.text.Bidi; import java.text.Bidi;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Deque;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import lombok.Getter;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSDictionary;
@ -46,6 +59,8 @@ import org.apache.pdfbox.text.TextPositionComparator;
import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort; import com.knecon.fforesight.service.layoutparser.processor.utils.QuickSort;
import lombok.Getter;
/** /**
* This is just a copy except i only adjusted lines 594-607 cause this is a bug in Pdfbox. * This is just a copy except i only adjusted lines 594-607 cause this is a bug in Pdfbox.
* see S416.pdf * see S416.pdf
@ -194,40 +209,33 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
} }
public void beginMarkedContentSequence(COSName tag, COSDictionary properties) {
public void beginMarkedContentSequence(COSName tag, COSDictionary properties)
{
PDMarkedContent markedContent = PDMarkedContent.create(tag, properties); PDMarkedContent markedContent = PDMarkedContent.create(tag, properties);
if (this.currentMarkedContents.isEmpty()) if (this.currentMarkedContents.isEmpty()) {
{
this.markedContents.add(markedContent); this.markedContents.add(markedContent);
} } else {
else PDMarkedContent currentMarkedContent = this.currentMarkedContents.peek();
{ if (currentMarkedContent != null) {
PDMarkedContent currentMarkedContent =
this.currentMarkedContents.peek();
if (currentMarkedContent != null)
{
currentMarkedContent.addMarkedContent(markedContent); currentMarkedContent.addMarkedContent(markedContent);
} }
} }
this.currentMarkedContents.push(markedContent); this.currentMarkedContents.push(markedContent);
} }
@Override @Override
public void endMarkedContentSequence() public void endMarkedContentSequence() {
{
if (!this.currentMarkedContents.isEmpty()) if (!this.currentMarkedContents.isEmpty()) {
{
this.currentMarkedContents.pop(); this.currentMarkedContents.pop();
} }
} }
public void xobject(PDXObject xobject) public void xobject(PDXObject xobject) {
{
if (!this.currentMarkedContents.isEmpty()) if (!this.currentMarkedContents.isEmpty()) {
{
this.currentMarkedContents.peek().addXObject(xobject); this.currentMarkedContents.peek().addXObject(xobject);
} }
} }
@ -635,7 +643,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
var normalized = normalize(line); var normalized = normalize(line);
// normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent() // normalized.stream().filter(l -> System.out.println(l.getText().contains("Plenarprotokoll 20/24")).findFirst().isPresent()
lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine); lastLineStartPosition = handleLineSeparation(current, lastPosition, lastLineStartPosition, maxHeightForLine);
writeLine(normalized, current.isParagraphStart); writeLine(normalized, current.isParagraphStart);
line.clear(); line.clear();
@ -914,8 +921,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
textList.add(text); textList.add(text);
} }
} }
if (!this.currentMarkedContents.isEmpty()) if (!this.currentMarkedContents.isEmpty()) {
{
this.currentMarkedContents.peek().addText(text); this.currentMarkedContents.peek().addText(text);
} }
} }
@ -2102,7 +2108,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
return endParagraphWritten; return endParagraphWritten;
} }
public void setEndParagraphWritten(){
public void setEndParagraphWritten() {
endParagraphWritten = true; endParagraphWritten = true;
} }
@ -2145,7 +2153,6 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
this.isHangingIndent = true; this.isHangingIndent = true;
} }
} }
} }

View File

@ -1,10 +1,13 @@
package com.knecon.fforesight.service.layoutparser.processor.services.visualization; package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
import java.awt.Color;
import java.awt.geom.AffineTransform; import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.io.IOException; import java.io.IOException;
import java.io.OutputStream; import java.io.OutputStream;
import java.util.HashSet; import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSDictionary;
@ -30,6 +33,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.visualization.
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid; import com.knecon.fforesight.service.layoutparser.processor.model.visualization.LayoutGrid;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText; import com.knecon.fforesight.service.layoutparser.processor.model.visualization.PlacedText;
import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage; import com.knecon.fforesight.service.layoutparser.processor.model.visualization.VisualizationsOnPage;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorCells;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
@ -40,7 +44,6 @@ import lombok.extern.slf4j.Slf4j;
@RequiredArgsConstructor @RequiredArgsConstructor
public class ViewerDocumentService { public class ViewerDocumentService {
private static final String LAYER_NAME = "Layout grid"; private static final String LAYER_NAME = "Layout grid";
private static final int FONT_SIZE = 10; private static final int FONT_SIZE = 10;
public static final float LINE_WIDTH = 1f; public static final float LINE_WIDTH = 1f;
@ -49,13 +52,18 @@ public class ViewerDocumentService {
@SneakyThrows @SneakyThrows
public void createViewerDocument(PDDocument pdDocument, Document document, OutputStream outputStream, boolean layerVisibilityDefaultValue) { public void createViewerDocument(PDDocument pdDocument,
Document document,
OutputStream outputStream,
Map<Integer,List<TableExtractorCells>> extractedTableCells,
boolean layerVisibilityDefaultValue) {
LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document); LayoutGrid layoutGrid = layoutGridService.createLayoutGrid(document);
// PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one. // PDDocument.save() is very slow, since it actually traverses the entire pdf and writes a new one.
// If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast. // If we collect all COSDictionaries we changed and tell it explicitly to only add the changed ones by using saveIncremental it's very fast.
Set<COSDictionary> dictionariesToUpdate = new HashSet<>(); Set<COSDictionary> dictionariesToUpdate = new HashSet<>();
PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue); PDOptionalContentGroup layer = addLayerToDocument(pdDocument, dictionariesToUpdate, layerVisibilityDefaultValue);
PDOptionalContentGroup visualLayoutParsingLayer = addLayerToDocument(pdDocument, dictionariesToUpdate, true);
PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); PDFont font = new PDType1Font(Standard14Fonts.FontName.HELVETICA);
for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) { for (int pageNumber = 0; pageNumber < pdDocument.getNumberOfPages(); pageNumber++) {
@ -72,44 +80,25 @@ public class ViewerDocumentService {
assert pageNumber == visualizationsOnPage.getPageNumber(); assert pageNumber == visualizationsOnPage.getPageNumber();
// We need to append to the content stream, otherwise the content could be overlapped by following content. // We need to append to the content stream, otherwise the content could be overlapped by following content.
try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) { try (var contentStream = new PDPageContentStream(pdDocument, pdPage, PDPageContentStream.AppendMode.APPEND, true)) {
contentStream.beginMarkedContent(COSName.OC, visualLayoutParsingLayer);
contentStream.beginMarkedContent(COSName.OC, layer);
contentStream.saveGraphicsState(); contentStream.saveGraphicsState();
contentStream.setLineWidth(LINE_WIDTH); contentStream.setLineWidth(LINE_WIDTH);
for (ColoredLine coloredLine : visualizationsOnPage.getColoredLines()) { for (TableExtractorCells tableCells : extractedTableCells.get(pageNumber)) {
contentStream.setStrokingColor(coloredLine.color()); contentStream.setStrokingColor(new Color(0xB700FF));
contentStream.moveTo((float) coloredLine.line().getX1(), (float) coloredLine.line().getY1()); contentStream.addRect((float) tableCells.getX0(), (float) tableCells.getY0(), (float) tableCells.getWidth(), (float) tableCells.getHeight());
contentStream.lineTo((float) coloredLine.line().getX2(), (float) coloredLine.line().getY2());
contentStream.stroke(); contentStream.stroke();
}
for (ColoredRectangle coloredRectangle : visualizationsOnPage.getColoredRectangles()) {
contentStream.setStrokingColor(coloredRectangle.color());
Rectangle2D r = coloredRectangle.rectangle2D();
contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
contentStream.stroke();
}
for (FilledRectangle filledRectangle : visualizationsOnPage.getFilledRectangles()) {
contentStream.setNonStrokingColor(filledRectangle.color());
PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState();
graphicsState.setNonStrokingAlphaConstant(filledRectangle.alpha());
contentStream.setGraphicsStateParameters(graphicsState);
Rectangle2D r = filledRectangle.rectangle2D();
contentStream.addRect((float) r.getX(), (float) r.getY(), (float) r.getWidth(), (float) r.getHeight());
contentStream.fill();
}
for (PlacedText placedText : visualizationsOnPage.getPlacedTexts()) {
contentStream.setFont(font, FONT_SIZE); contentStream.setFont(font, FONT_SIZE);
contentStream.beginText(); contentStream.beginText();
Matrix textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(), Matrix textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
(float) textDeRotationMatrix.getShearX(), (float) textDeRotationMatrix.getShearX(),
(float) textDeRotationMatrix.getShearY(), (float) textDeRotationMatrix.getShearY(),
(float) textDeRotationMatrix.getScaleY(), (float) textDeRotationMatrix.getScaleY(),
(float) placedText.lineStart().getX(), tableCells.getX0() ,
(float) placedText.lineStart().getY()); tableCells.getY0());
textMatrix.translate(-((font.getStringWidth(placedText.text()) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), -FONT_SIZE); textMatrix.translate(-((font.getStringWidth(tableCells.getLabel()) / 1000) * FONT_SIZE + (2 * LINE_WIDTH) + 4), -FONT_SIZE);
contentStream.setTextMatrix(textMatrix); contentStream.setTextMatrix(textMatrix);
contentStream.showText(placedText.text()); contentStream.showText(tableCells.getLabel());
contentStream.endText(); contentStream.endText();
} }
contentStream.restoreGraphicsState(); contentStream.restoreGraphicsState();

View File

@ -1,12 +1,5 @@
package com.knecon.fforesight.service.layoutparser.processor.utils; package com.knecon.fforesight.service.layoutparser.processor.utils;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.TextPosition;
import java.awt.geom.Rectangle2D; import java.awt.geom.Rectangle2D;
import java.util.Collection; import java.util.Collection;
import java.util.Collections; import java.util.Collections;
@ -14,12 +7,22 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import lombok.experimental.UtilityClass;
@UtilityClass @UtilityClass
public class MarkedContentUtils { public class MarkedContentUtils {
public static final String HEADER = "Header"; public static final String HEADER = "Header";
public static final String FOOTER = "Footer"; public static final String FOOTER = "Footer";
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) { public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
if (markedContents == null) { if (markedContents == null) {
@ -31,7 +34,8 @@ public class MarkedContentUtils {
.filter(m -> m.getProperties() != null) .filter(m -> m.getProperties() != null)
.filter(m -> m.getProperties().getItem("Subtype") != null) .filter(m -> m.getProperties().getItem("Subtype") != null)
.filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype)) .filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype))
.map(PDMarkedContent::getContents).flatMap(Collection::stream) .map(PDMarkedContent::getContents)
.flatMap(Collection::stream)
.filter(t -> t instanceof TextPosition) .filter(t -> t instanceof TextPosition)
.map(t -> (TextPosition) t) .map(t -> (TextPosition) t)
.filter(t -> !t.getUnicode().equals(" ")) .filter(t -> !t.getUnicode().equals(" "))
@ -41,16 +45,19 @@ public class MarkedContentUtils {
return Collections.emptyList(); return Collections.emptyList();
} }
return markedContentByYPosition.values().stream() return markedContentByYPosition.values()
.map(textPositions -> new TextPositionSequence(textPositions.stream() .stream()
.toList(), 0, true) .map(textPositions -> new TextPositionSequence(textPositions.stream().toList(), 0, true).getRectangle())
.getRectangle()) .map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList()); .collect(Collectors.toList());
} }
public boolean intersects(TextPageBlock textBlock, Map<String, List<Rectangle2D>> markedContentBboxPerType, String type) { public boolean intersects(TextPageBlock textBlock, Map<String, List<Rectangle2D>> markedContentBboxPerType, String type) {
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).stream().anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type)
.stream()
.anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
} }
} }

View File

@ -19,10 +19,9 @@ public final class PositionUtils {
double threshold = textBlock.getMostPopularWordHeight() * 3; double threshold = textBlock.getMostPopularWordHeight() * 3;
if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX() if (textBlock.getPdfMinX() + threshold > btf.getTopLeft().getX() && textBlock.getPdfMaxX() - threshold < btf.getTopLeft()
&& textBlock.getPdfMaxX() - threshold < btf.getTopLeft().getX() + btf.getWidth() .getX() + btf.getWidth() && textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY() && textBlock.getPdfMaxY() - threshold < btf.getTopLeft()
&& textBlock.getPdfMinY() + threshold > btf.getTopLeft().getY() .getY() + btf.getHeight()) {
&& textBlock.getPdfMaxY() - threshold < btf.getTopLeft().getY() + btf.getHeight()) {
return true; return true;
} else { } else {
return false; return false;

View File

@ -41,11 +41,14 @@ public class RectangleTransformations {
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector()); return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
} }
public static Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> collectBBox() { public static Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> collectBBox() {
return new Rectangle2DBBoxCollector(); return new Rectangle2DBBoxCollector();
} }
public static PDRectangle toPDRectangleBBox(List<Rectangle> rectangles) { public static PDRectangle toPDRectangleBBox(List<Rectangle> rectangles) {
Rectangle2D rectangle2D = RectangleTransformations.rectangleBBox(rectangles); Rectangle2D rectangle2D = RectangleTransformations.rectangleBBox(rectangles);
@ -70,6 +73,7 @@ public class RectangleTransformations {
return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight()); return format("%f,%f,%f,%f", rectangle2D.getX(), rectangle2D.getY(), rectangle2D.getWidth(), rectangle2D.getHeight());
} }
public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) { public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector()); return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector());
@ -84,6 +88,7 @@ public class RectangleTransformations {
-redactionLogRectangle.getHeight()); -redactionLogRectangle.getHeight());
} }
public static Rectangle2D toRectangle2D(PDRectangle rectangle) { public static Rectangle2D toRectangle2D(PDRectangle rectangle) {
return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight()); return new Rectangle2D.Double(rectangle.getLowerLeftX(), rectangle.getLowerLeftY(), rectangle.getWidth(), rectangle.getHeight());

View File

@ -3,7 +3,6 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence; import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;

View File

@ -28,15 +28,13 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPosit
* *
* @author Ben Litchfield * @author Ben Litchfield
*/ */
public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> public class TextPositionSequenceComparator implements Comparator<TextPositionSequence> {
{
@Override @Override
public int compare(TextPositionSequence pos1, TextPositionSequence pos2) public int compare(TextPositionSequence pos1, TextPositionSequence pos2) {
{
// only compare text that is in the same direction // only compare text that is in the same direction
int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees()); int cmp1 = Float.compare(pos1.getDir().getDegrees(), pos2.getDir().getDegrees());
if (cmp1 != 0) if (cmp1 != 0) {
{
return cmp1; return cmp1;
} }
@ -54,19 +52,13 @@ public class TextPositionSequenceComparator implements Comparator<TextPositionSe
float yDifference = Math.abs(pos1YBottom - pos2YBottom); float yDifference = Math.abs(pos1YBottom - pos2YBottom);
// we will do a simple tolerance comparison // we will do a simple tolerance comparison
if (yDifference < .1 || if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom ||
pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom)
{
return Float.compare(x1, x2); return Float.compare(x1, x2);
} } else if (pos1YBottom < pos2YBottom) {
else if (pos1YBottom < pos2YBottom)
{
return -1; return -1;
} } else {
else
{
return 1; return 1;
} }
} }
} }

View File

@ -14,7 +14,7 @@ import com.knecon.fforesight.service.layoutparser.server.queue.MessagingConfigur
import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration; import com.knecon.fforesight.tenantcommons.MultiTenancyAutoConfiguration;
@ImportAutoConfiguration({MultiTenancyAutoConfiguration.class}) @ImportAutoConfiguration({MultiTenancyAutoConfiguration.class})
@Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class, MessagingConfiguration.class}) @Import({MetricsConfiguration.class, StorageAutoConfiguration.class, LayoutParsingServiceProcessorConfiguration.class, MessagingConfiguration.class})
@SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class}) @SpringBootApplication(exclude = {SecurityAutoConfiguration.class, ManagementWebSecurityAutoConfiguration.class})
public class Application { public class Application {

View File

@ -26,6 +26,7 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipelin
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock; import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
@ -50,7 +51,7 @@ public class BdrJsonBuildTest extends AbstractTest {
return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS, return DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.TAAS,
pdDocument, pdDocument,
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse())); new TableServiceResponse(), new TableExtractorResponse()));
} }
} }

View File

@ -33,6 +33,7 @@ import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipelin
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.tenantcommons.TenantsClient; import com.knecon.fforesight.tenantcommons.TenantsClient;
@ -98,7 +99,7 @@ public class HeadlinesGoldStandardIntegrationTest {
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Loader.loadPDF(pdfFileResource.getFile()), Loader.loadPDF(pdfFileResource.getFile()),
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse())); new TableServiceResponse(), new TableExtractorResponse()));
var foundHeadlines = documentGraph.streamAllSubNodes() var foundHeadlines = documentGraph.streamAllSubNodes()
.map(SemanticNode::getHeadline) .map(SemanticNode::getHeadline)

View File

@ -16,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.Do
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
@ -58,7 +59,7 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, Document documentGraph = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Loader.loadPDF(filename.toFile()), Loader.loadPDF(filename.toFile()),
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse())); new TableServiceResponse(), new TableExtractorResponse()));
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph); DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
ObjectMapper mapper = ObjectMapperFactory.create(); ObjectMapper mapper = ObjectMapperFactory.create();

View File

@ -1,27 +1,53 @@
package com.knecon.fforesight.service.layoutparser.server.graph; package com.knecon.fforesight.service.layoutparser.server.graph;
import static com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw.drawRectangle2DList;
import java.awt.Color;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map;
import org.apache.pdfbox.Loader; import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.json.JSONArray;
import org.json.JSONObject;
import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.ClassPathResource; import org.springframework.core.io.ClassPathResource;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentData;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.DocumentStructure;
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType; import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType; import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell; import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock; import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.ExtractedTable;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService;
import com.knecon.fforesight.service.layoutparser.processor.services.LineDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
@ -29,7 +55,9 @@ import com.knecon.fforesight.service.layoutparser.processor.services.mapper.Docu
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper; import com.knecon.fforesight.service.layoutparser.processor.services.mapper.PropertiesMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService; import com.knecon.fforesight.service.layoutparser.processor.services.visualization.ViewerDocumentService;
import com.knecon.fforesight.service.layoutparser.processor.utils.PdfVisualisationUtility;
import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest; import com.knecon.fforesight.service.layoutparser.server.utils.BuildDocumentTest;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows; import lombok.SneakyThrows;
@ -41,6 +69,9 @@ public class ViewerDocumentTest extends BuildDocumentTest {
@Autowired @Autowired
private RedactManagerClassificationService redactManagerClassificationService; private RedactManagerClassificationService redactManagerClassificationService;
@Autowired
private ObjectMapper objectMapper;
@Test @Test
@SneakyThrows @SneakyThrows
public void testViewerDocument() { public void testViewerDocument() {
@ -51,16 +82,69 @@ public class ViewerDocumentTest extends BuildDocumentTest {
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService); ViewerDocumentService viewerDocumentService = new ViewerDocumentService(layoutGridService);
Document document = buildGraph(fileName, LayoutParsingType.TAAS); Document document = buildGraph(fileName, LayoutParsingType.TAAS);
try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) { try (var pdDocument = Loader.loadPDF(new ClassPathResource(fileName).getFile()); var out = new FileOutputStream(tmpFileName)) {
viewerDocumentService.createViewerDocument(pdDocument, document, out, true); viewerDocumentService.createViewerDocument(pdDocument, document, out, null,true);
} }
} }
/**
 * Manual smoke test for the table extractor prototype: loads a PDF and a table extractor
 * response (JSON) from the local file system, runs the response through
 * {@link #convertTableCells(List)} and writes the (unmodified) PDF to a temp file.
 *
 * <p>The input and output locations are absolute paths on a single developer machine, so the
 * test is disabled by default; adjust the paths and run it explicitly when needed.
 */
@Test
@Disabled("Relies on absolute paths that only exist on one developer machine")
@SneakyThrows
public void testLayoutParsingServiceResults() {
    String tableSourceFileName ="C:\\Users\\YannikHampe\\Downloads\\b28d9a22b674906813f12b86dda33202.EXTRACTED_TABLES.json\\b28d9a22b674906813f12b86dda33202.EXTRACTED_TABLES.json";
    Path pdfFileResource = Path.of("C:\\Users\\YannikHampe\\Downloads\\2009-1048395_50pages_tables.pdf");
    String tmpFileName = "C:/Users/YANNIK~1/AppData/Local/Temp/rectangles."+pdfFileResource.getFileName();

    // try-with-resources guarantees the PDF is closed even when parsing or saving fails.
    try (PDDocument pdDocument = Loader.loadPDF(pdfFileResource.toFile())) {
        try (InputStream inputStream = Files.newInputStream(Path.of(tableSourceFileName))) {
            TableExtractorResponse tableExtractorResponse = objectMapper.readValue(inputStream, TableExtractorResponse.class);
            // The converted cells are not drawn yet; the call currently only prints the labels.
            tableExtractorResponse.getData().forEach(data -> convertTableCells(data.getTables()));
        }
        try (var out = new FileOutputStream(tmpFileName)) {
            pdDocument.save(out);
        }
    }
}
/**
 * Flattens extracted tables into a list of {@link TableCells}: for every table one entry for the
 * table's own bounding box, followed by one entry per object contained in that table.
 *
 * <p>Bounding boxes are read as {@code [x0, y0, x1, y1]}; width and height are derived from
 * those coordinates. The label of every table and object is printed to stdout.
 *
 * @param tableObjects the extracted tables to convert
 * @return the table and object bounding boxes, in encounter order
 */
public List<TableCells> convertTableCells(List<ExtractedTable> tableObjects) {
    List<TableCells> parsedTableCells = new ArrayList<>();
    for (ExtractedTable extractedTable : tableObjects) {
        System.out.println(extractedTable.getTable().getLabel());
        var tableBbox = extractedTable.getTable().getBbox();
        TableCells tableCells = new TableCells();
        tableCells.setX0(tableBbox.get(0));
        tableCells.setX1(tableBbox.get(2));
        tableCells.setY0(tableBbox.get(1));
        tableCells.setY1(tableBbox.get(3));
        tableCells.setWidth(tableCells.getX1() - tableCells.getX0());
        tableCells.setHeight(tableCells.getY1() - tableCells.getY0());
        parsedTableCells.add(tableCells);

        for (var tableObject : extractedTable.getObjects()) {
            System.out.println(tableObject.getLabel());
            // Use the object's own bounding box; previously the enclosing table's box was
            // copied here, so every object ended up with the table's rectangle.
            // NOTE(review): assumes the object type exposes getBbox() like the table entry does.
            var objectBbox = tableObject.getBbox();
            TableCells objectCell = new TableCells();
            objectCell.setX0(objectBbox.get(0));
            objectCell.setX1(objectBbox.get(2));
            objectCell.setY0(objectBbox.get(1));
            objectCell.setY1(objectBbox.get(3));
            objectCell.setWidth(objectCell.getX1() - objectCell.getX0());
            objectCell.setHeight(objectCell.getY1() - objectCell.getY0());
            parsedTableCells.add(objectCell);
        }
    }
    return parsedTableCells;
}
public ClassificationDocument buildClassificationDocument(PDDocument originDocument) { public ClassificationDocument buildClassificationDocument(PDDocument originDocument) {
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
originDocument, originDocument,
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse()); new TableServiceResponse(), new TableExtractorResponse());
redactManagerClassificationService.classifyDocument(classificationDocument); redactManagerClassificationService.classifyDocument(classificationDocument);

View File

@ -33,6 +33,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePag
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter; import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.ImageServiceResponseAdapter;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService; import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService; import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
@ -67,7 +68,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
originDocument, originDocument,
new ImageServiceResponse(), new ImageServiceResponse(),
tableServiceResponse); tableServiceResponse, new TableExtractorResponse());
redactManagerClassificationService.classifyDocument(classificationDocument); redactManagerClassificationService.classifyDocument(classificationDocument);

View File

@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings; import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor; import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService; import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
@ -79,11 +80,11 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Loader.loadPDF(filename.toFile()), Loader.loadPDF(filename.toFile()),
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse())); new TableServiceResponse(), new TableExtractorResponse()));
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER, Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
Loader.loadPDF(filename.toFile()), Loader.loadPDF(filename.toFile()),
new ImageServiceResponse(), new ImageServiceResponse(),
new TableServiceResponse())); new TableServiceResponse(), new TableExtractorResponse()));
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore); DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter); DocumentData documentDataAfter = DocumentDataMapper.toDocumentData(documentGraphAfter);
if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) { if (!compareStructures(documentDataBefore.getDocumentStructure(), documentDataAfter.getDocumentStructure(), filename.getFileName().toString())) {

View File

@ -26,6 +26,8 @@ import org.springframework.test.context.junit.jupiter.SpringExtension;
import java.io.InputStream; import java.io.InputStream;
import java.util.Optional; import java.util.Optional;
import javax.swing.text.html.Option;
@ExtendWith(SpringExtension.class) @ExtendWith(SpringExtension.class)
@SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT) @SpringBootTest(classes = Application.class, webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Import(AbstractTest.TestConfiguration.class) @Import(AbstractTest.TestConfiguration.class)
@ -46,6 +48,8 @@ public abstract class AbstractTest {
protected final static String ORIGIN_FILE_ID = "origin"; protected final static String ORIGIN_FILE_ID = "origin";
protected final static String TABLE_FILE_ID = "table"; protected final static String TABLE_FILE_ID = "table";
protected final static String IMAGE_FILE_ID = "image"; protected final static String IMAGE_FILE_ID = "image";
protected final static String TABLE_EXTRACTOR_FILE_ID = "extractedTable";
protected final static String STRUCTURE_FILE_ID = "structure"; protected final static String STRUCTURE_FILE_ID = "structure";
protected final static String TEXT_FILE_ID = "texts"; protected final static String TEXT_FILE_ID = "texts";
protected final static String POSITION_FILE_ID = "positions"; protected final static String POSITION_FILE_ID = "positions";
@ -62,6 +66,7 @@ public abstract class AbstractTest {
.originFileStorageId(ORIGIN_FILE_ID) .originFileStorageId(ORIGIN_FILE_ID)
.tablesFileStorageId(Optional.of(TABLE_FILE_ID)) .tablesFileStorageId(Optional.of(TABLE_FILE_ID))
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID)) .imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
.tableExtractorFileId(Optional.of(TABLE_EXTRACTOR_FILE_ID))
.structureFileStorageId(STRUCTURE_FILE_ID) .structureFileStorageId(STRUCTURE_FILE_ID)
.textBlockFileStorageId(TEXT_FILE_ID) .textBlockFileStorageId(TEXT_FILE_ID)
.positionBlockFileStorageId(POSITION_FILE_ID) .positionBlockFileStorageId(POSITION_FILE_ID)
@ -89,7 +94,7 @@ public abstract class AbstractTest {
@SneakyThrows @SneakyThrows
protected LayoutParsingRequest prepareStorage(String file) { protected LayoutParsingRequest prepareStorage(String file) {
return prepareStorage(file, "cv_table_parsing_response/empty.json", "image_service_response/empty.json"); return prepareStorage(file, "cv_table_parsing_response/empty.json", "image_service_response/empty.json","table_extractor_response/empty.json");
} }
@ -107,6 +112,7 @@ public abstract class AbstractTest {
.originFileStorageId(ORIGIN_FILE_ID) .originFileStorageId(ORIGIN_FILE_ID)
.tablesFileStorageId(Optional.of(TABLE_FILE_ID)) .tablesFileStorageId(Optional.of(TABLE_FILE_ID))
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID)) .imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
.tableExtractorFileId(Optional.of(TABLE_EXTRACTOR_FILE_ID))
.structureFileStorageId(STRUCTURE_FILE_ID) .structureFileStorageId(STRUCTURE_FILE_ID)
.textBlockFileStorageId(TEXT_FILE_ID) .textBlockFileStorageId(TEXT_FILE_ID)
.positionBlockFileStorageId(POSITION_FILE_ID) .positionBlockFileStorageId(POSITION_FILE_ID)
@ -117,21 +123,23 @@ public abstract class AbstractTest {
} }
@SneakyThrows @SneakyThrows
protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile) { protected LayoutParsingRequest prepareStorage(String file, String cvServiceResponseFile, String imageInfoFile, String tableExtractorResponseFile) {
ClassPathResource pdfFileResource = new ClassPathResource(file); ClassPathResource pdfFileResource = new ClassPathResource(file);
ClassPathResource cvServiceResponseFileResource = new ClassPathResource(cvServiceResponseFile); ClassPathResource cvServiceResponseFileResource = new ClassPathResource(cvServiceResponseFile);
ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile); ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile);
ClassPathResource tableExtractorResponseFileRessource = new ClassPathResource(tableExtractorResponseFile);
return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream()); return prepareStorage(pdfFileResource.getInputStream(), cvServiceResponseFileResource.getInputStream(), imageInfoFileResource.getInputStream(),tableExtractorResponseFileRessource.getInputStream());
} }
@SneakyThrows @SneakyThrows
protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream) { protected LayoutParsingRequest prepareStorage(InputStream fileStream, InputStream cvServiceResponseFileStream, InputStream imageInfoStream, InputStream tableExtractorResponseFileStream) {
storageService.storeObject(TenantContext.getTenantId(), IMAGE_FILE_ID, imageInfoStream); storageService.storeObject(TenantContext.getTenantId(), IMAGE_FILE_ID, imageInfoStream);
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream); storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
storageService.storeObject(TenantContext.getTenantId(), TABLE_EXTRACTOR_FILE_ID, tableExtractorResponseFileStream);
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream); storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER); return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);

View File

@ -11,6 +11,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline; import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument; import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document; import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableExtractorResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse; import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory; import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
@ -28,7 +29,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
ClassPathResource fileResource = new ClassPathResource(filename); ClassPathResource fileResource = new ClassPathResource(filename);
prepareStorage(filename); prepareStorage(filename);
try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream.readAllBytes())) { try (InputStream inputStream = fileResource.getInputStream(); PDDocument pdDocument = Loader.loadPDF(inputStream.readAllBytes())) {
return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse()); return layoutParsingPipeline.parseLayout(layoutParsingType, pdDocument, layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID), new TableServiceResponse(), new TableExtractorResponse());
} }
} }
@ -44,7 +45,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) { protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) { if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json"); prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json","table_extractor_response/empty.json");
} else { } else {
prepareStorage(filename); prepareStorage(filename);
} }

View File

@ -0,0 +1,8 @@
{
"dossierId": "123",
"fileId": "123",
"operation": "table",
"targetFileExtension": "ORIGIN.pdf.gz",
"responseFileExtension": "TABLES.json.gz",
"extractedTableData": []
}

View File

@ -4,5 +4,5 @@ gradle assemble
buildNumber=${1:-1} buildNumber=${1:-1}
gradle bootBuildImage --cleanCache --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$USER-$buildNumber gradle bootBuildImage --cleanCache --publishImage -Pversion=layout-parser-yannik-$buildNumber --stacktrace
echo "nexus.knecon.com:5001/red/${dir}-server-v1:$USER-$buildNumber" echo "nexus.knecon.com:5001/red/${dir}-server-v1:table-extractor-yannik-$buildNumber"