RED-8670: add table detection from idp result

* some 'slight' refactoring
This commit is contained in:
Kilian Schuettler 2024-12-18 15:07:31 +01:00
parent b5152112ee
commit 3a700aecd4
105 changed files with 2593 additions and 1726 deletions

View File

@ -10,38 +10,23 @@ import lombok.NonNull;
@Builder
@Schema(description = "Object containing all storage paths the service needs to know.")
public record LayoutParsingRequest(
@Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}")//
@NonNull LayoutParsingType layoutParsingType,
@Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}") @NonNull LayoutParsingType layoutParsingType,
@Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.") Map<String, String> identifier,
@Schema(description = "Path to the original PDF file.") @NonNull String originFileStorageId,
@Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.")//
Map<String, String> identifier,
@Schema(description = "Optional Path to the table extraction file.") Optional<String> tablesFileStorageId,
@Schema(description = "Optional Path to the image classification file.") Optional<String> imagesFileStorageId,
@Schema(description = "Path where the IDP Result File is stored.") Optional<String> idpResultStorageId,
@Schema(description = "Optional Path to the the visual layout parsing service file") Optional<String> visualLayoutParsingFileId,
@Schema(description = "Path to the original PDF file.")//
@NonNull String originFileStorageId,//
@Schema(description = "Optional Path to the table extraction file.")//
Optional<String> tablesFileStorageId,//
@Schema(description = "Optional Path to the image classification file.")//
Optional<String> imagesFileStorageId,//
@Schema(description = "Optional Path to the the visual layout parsing service file") Optional<String> visualLayoutParsingFileId,//
@Schema(description = "Path where the Document Structure File will be stored.")//
@NonNull String structureFileStorageId,//
@Schema(description = "Path where the Research Data File will be stored.")//
String researchDocumentStorageId,//
@Schema(description = "Path where the Document Text File will be stored.")//
@NonNull String textBlockFileStorageId,//
@Schema(description = "Path where the Document Positions File will be stored.")//
@NonNull String positionBlockFileStorageId,//
@Schema(description = "Path where the Document Pages File will be stored.")//
@NonNull String pageFileStorageId,//
@Schema(description = "Path where the Document Markdown File will be stored.")//
Optional<String> documentMarkdownFileStorageId,//
@Schema(description = "Path where the Simplified Text File will be stored.")//
@NonNull String simplifiedTextStorageId,//
@Schema(description = "Path where the Viewer Document PDF will be stored.")//
@NonNull String viewerDocumentStorageId
@Schema(description = "Path where the Document Structure File will be stored.") @NonNull String structureFileStorageId,
@Schema(description = "Path where the Research Data File will be stored.") String researchDocumentStorageId,
@Schema(description = "Path where the Document Text File will be stored.") @NonNull String textBlockFileStorageId,
@Schema(description = "Path where the Document Positions File will be stored.") @NonNull String positionBlockFileStorageId,
@Schema(description = "Path where the Document Pages File will be stored.") @NonNull String pageFileStorageId,
@Schema(description = "Path where the Document Markdown File will be stored.") Optional<String> documentMarkdownFileStorageId,
@Schema(description = "Path where the Simplified Text File will be stored.") @NonNull String simplifiedTextStorageId,
@Schema(description = "Path where the Viewer Document PDF will be stored.") @NonNull String viewerDocumentStorageId
) {
}

View File

@ -23,6 +23,8 @@ dependencies {
}
implementation("com.iqser.red.commons:storage-commons:2.50.0")
api("com.knecon.fforesight:azure-ocr-service-api:0.23.0")
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")

View File

@ -17,4 +17,6 @@ public class LayoutParserSettings {
boolean debug;
LayoutParsingType layoutParsingTypeOverride;
String pdftronLicense;
int extractionThreads = 1;
}

View File

@ -14,39 +14,39 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEnhancementService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
@ -56,24 +56,26 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.tables.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.service.ocr.v1.api.model.Table;
import io.micrometer.observation.Observation;
import io.micrometer.observation.ObservationRegistry;
@ -98,10 +100,8 @@ public class LayoutParsingPipeline {
final SimplifiedSectionTextService simplifiedSectionTextService;
final RulingCleaningService rulingCleaningService;
final TableExtractionService tableExtractionService;
final DocuMineBlockificationService docuMineBlockificationService;
final RedactManagerBlockificationService redactManagerBlockificationService;
final BlockificationService blockificationService;
final BlockificationPostprocessingService blockificationPostprocessingService;
final DocstrumBlockificationService docstrumBlockificationService;
final LayoutGridService layoutGridService;
final ObservationRegistry observationRegistry;
final VisualLayoutParsingAdapter visualLayoutParsingAdapter;
@ -111,11 +111,11 @@ public class LayoutParsingPipeline {
final SectionTreeEnhancementService sectionTreeEnhancementService;
final LayoutParserSettings settings;
final ClassificationService classificationService;
final ReadingOrderService readingOrderService;
@Value("${LAYOUT_PARSER_VERSION:}")
private String layoutParserVersion;
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
long start = System.currentTimeMillis();
@ -134,14 +134,16 @@ public class LayoutParsingPipeline {
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
.map(layoutParsingStorageService::getTablesFile)
.orElse(new TableServiceResponse());
IdpResult idpResult = layoutParsingRequest.idpResultStorageId()
.map(layoutParsingStorageService::getIdpResultFile).orElse(IdpResult.empty());
LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();
LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();
ClassificationDocument classificationDocument = parseLayout(layoutParsingType,
originFile,
imageServiceResponse,
tableServiceResponse,
idpResult,
visualLayoutParsingResponse,
layoutParsingRequest.identifier());
@ -159,7 +161,8 @@ public class LayoutParsingPipeline {
if (layoutParsingRequest.documentMarkdownFileStorageId()
.isPresent()) {
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
.get(), new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
.get(),
new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
}
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document()));
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
@ -237,15 +240,22 @@ public class LayoutParsingPipeline {
File originFile,
ImageServiceResponse imageServiceResponse,
TableServiceResponse tableServiceResponse,
IdpResult idpResult,
VisualLayoutParsingResponse visualLayoutParsingResponse,
Map<String, String> identifier) {
PDDocument originDocument = openDocument(originFile);
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
PageContentExtractor extractor = new PageContentExtractor(originFile, settings.getExtractionThreads());
extractor.startAsync();
int pageCount = extractor.getPageCount();
addNumberOfPagesToTrace(pageCount, Files.size(originFile.toPath()));
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse, idpResult);
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
Function<Table, Integer> pageNumberExtractor = table -> table.bboxes().get(0).pageNumber();
Map<Integer, List<Table>> idpTablesPerPage = idpResult.tables()
.stream()
.collect(Collectors.groupingBy(pageNumberExtractor));
ClassificationDocument classificationDocument = new ClassificationDocument();
@ -255,32 +265,20 @@ public class LayoutParsingPipeline {
List<ClassificationPage> classificationPages = new ArrayList<>();
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
long pageCount = originDocument.getNumberOfPages();
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originFile));
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
if (pageNumber % 100 == 0) {
// re-open document every once in a while to save on RAM. This has no significant performance impact.
// This is due to PDFBox caching all images and some other stuff with Soft References. This dereferences them and forces the freeing of memory.
originDocument.close();
originDocument = openDocument(originFile);
}
PageContents pageContents = extractor.awaitPageContents(pageNumber);
if (pageNumber % 100 == 0 || pageNumber == pageCount || pageNumber == 1) {
log.info("Extracting text on Page {} for {}", pageNumber, identifier);
log.info("Processing text on Page {} for {}", pageNumber, identifier);
}
classificationDocument.setPages(classificationPages);
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
PDPage pdPage = originDocument.getPage(pageNumber - 1);
stripper.setPageNumber(pageNumber);
stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage);
stripper.getText(originDocument);
List<Word> words = stripper.getWords();
List<Word> words = pageContents.getWords();
List<Ruling> rulings = pageContents.getRulings();
PageInformation pageInformation = pageContents.getPageInformation();
// rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now
@ -291,39 +289,23 @@ public class LayoutParsingPipeline {
}
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);
PDRectangle pdr = pdPage.getMediaBox();
List<Ruling> rulings = stripper.getRulings();
classificationDocument.getLayoutDebugLayer().addRulingVisualization(rulings, pageNumber);
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), rulings);
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber);
classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber, null);
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getWords(), false);
List<TablePageBlock> tables = tableExtractionService.extractTables(emptyTableCells, words, pageInformation, idpTablesPerPage.get(pageNumber), layoutParsingType, classificationDocument.getLayoutDebugLayer());
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
.addAll(graphics.stream()
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()),
ImageType.GRAPHIC,
false,
stripper.getPageNumber(),
""))
.toList());
List<ClassifiedImage> graphics = graphicExtractorService.extractPathElementGraphics(pageContents.getGraphicBBoxes(), pageNumber, cleanRulings);
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()).addAll(graphics);
ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
};
List<TextPageBlock> textBlocks = blockificationService.blockify(layoutParsingType, words, cleanRulings, classificationDocument.getLayoutDebugLayer());
updateClassificationPage(pdPage, pdr, classificationPage, cleanRulings, pageNumber, pageInformation);
List<AbstractPageBlock> blocks = readingOrderService.resolve(textBlocks, tables);
ClassificationPage classificationPage = new ClassificationPage(blocks, pageInformation, cleanRulings);
blockificationPostprocessingService.findHeadlinesFromOutline(classificationDocument, pageNumber, classificationPage, pageInformation);
@ -345,16 +327,12 @@ public class LayoutParsingPipeline {
}
}
tableExtractionService.extractTables(emptyTableCells, classificationPage);
buildPageStatistics(classificationPage);
increaseDocumentStatistics(classificationPage, classificationDocument);
classificationPages.add(classificationPage);
}
originDocument.close();
classificationService.classify(classificationDocument, layoutParsingType, identifier);
SectionTree sectionTree = sectionTreeBuilderService.createSectionTree(classificationDocument);
@ -371,24 +349,6 @@ public class LayoutParsingPipeline {
}
/**
 * Copies page-level metadata onto the given classification page: the cleaned rulings, the page rotation,
 * the landscape flag, the page number and the page dimensions.
 * <p>
 * The landscape flag is set when the media box is wider than tall at a rotation of 0 or 180,
 * or taller than wide at a rotation of 90 or 270.
 */
private static void updateClassificationPage(PDPage pdPage,
                                             PDRectangle pdr,
                                             ClassificationPage classificationPage,
                                             CleanRulings cleanRulings,
                                             int pageNumber,
                                             PageInformation pageInformation) {

    int rotation = pdPage.getRotation();

    boolean uprightOrFlipped = rotation == 0 || rotation == 180;
    boolean turnedSideways = rotation == 90 || rotation == 270;
    boolean widerThanTall = pdr.getWidth() > pdr.getHeight();
    boolean tallerThanWide = pdr.getHeight() > pdr.getWidth();
    boolean isLandscape = (widerThanTall && uprightOrFlipped) || (tallerThanWide && turnedSideways);

    classificationPage.setCleanRulings(cleanRulings);
    classificationPage.setRotation(rotation);
    classificationPage.setLandscape(isLandscape);
    classificationPage.setPageNumber(pageNumber);
    classificationPage.setPageWidth((float) pageInformation.width());
    classificationPage.setPageHeight((float) pageInformation.height());
}
private static void rotateDirAdjExactly(List<Word> words, PDPage pdPage) {
for (TextDirection dir : TextDirection.values()) {

View File

@ -25,6 +25,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocVersioningUtility;
import com.knecon.fforesight.tenantcommons.TenantContext;
@ -95,7 +96,23 @@ public class LayoutParsingStorageService {
}
@SneakyThrows
/**
 * Loads the IDP result stored under the given storage id for the current tenant.
 * <p>
 * Checked exceptions raised while reading or deserializing are rethrown without being declared,
 * via {@code @SneakyThrows}.
 *
 * @param storageId id of the stored IDP result object
 * @return the deserialized result, or {@link IdpResult#empty()} when no object exists under that id
 */
@SneakyThrows
public IdpResult getIdpResultFile(String storageId) {

    if (!storageService.objectExists(TenantContext.getTenantId(), storageId)) {
        return IdpResult.empty();
    }

    // try-with-resources closes the stream on every exit path, so no explicit close() call is needed.
    try (var idpResultStream = getObject(storageId)) {
        return objectMapper.readValue(idpResultStream, IdpResult.class);
    }
}
@SneakyThrows
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) {
try (InputStream inputStream = getObject(storageId)) {

View File

@ -1,9 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
@ -16,10 +14,8 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.Rea
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
import lombok.RequiredArgsConstructor;
@ -27,7 +23,6 @@ import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor
public class DocstrumSegmentationService {
public static final double SAME_DIRECTION_THRESHOLD = 0.9;
private final NearestNeighbourService nearestNeighbourService;
private final SpacingService spacingService;
private final LineBuilderService lineBuilderService;
@ -35,52 +30,27 @@ public class DocstrumSegmentationService {
private final ReadingOrderService readingOrderService;
public List<Zone> segmentPage(List<Word> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) {
public List<Zone> segmentPage(List<Word> words, boolean xyOrder, CleanRulings usedRulings) {
EnumMap<TextDirection, Integer> directionCounts = new EnumMap<>(TextDirection.class);
List<Zone> newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO);
directionCounts.put(TextDirection.ZERO, newZones.size());
List<Zone> newZones = computeZones(words, usedRulings, TextDirection.ZERO);
List<Zone> zones = new ArrayList<>(newZones);
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE);
directionCounts.put(TextDirection.QUARTER_CIRCLE, newZones.size());
newZones = computeZones(words, usedRulings, TextDirection.QUARTER_CIRCLE);
zones.addAll(newZones);
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE);
directionCounts.put(TextDirection.HALF_CIRCLE, newZones.size());
newZones = computeZones(words, usedRulings, TextDirection.HALF_CIRCLE);
zones.addAll(newZones);
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE);
directionCounts.put(TextDirection.THREE_QUARTER_CIRCLE, newZones.size());
newZones = computeZones(words, usedRulings, TextDirection.THREE_QUARTER_CIRCLE);
zones.addAll(newZones);
return readingOrderService.resolve(zones, xyOrder, mostSameDirection(directionCounts));
return readingOrderService.resolve(zones, xyOrder);
}
private boolean mostSameDirection(EnumMap<TextDirection, Integer> directionCounts) {
private List<Zone> computeZones(List<Word> words, CleanRulings rulings, TextDirection direction) {
int total = directionCounts.values()
.stream()
.mapToInt(i -> i).sum();
if ((double) directionCounts.get(TextDirection.ZERO) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.get(TextDirection.QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.get(TextDirection.HALF_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.get(TextDirection.THREE_QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
return true;
}
return false;
}
private List<Zone> computeZones(List<Word> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) {
List<Character> characters = textPositions.stream()
List<Character> characters = words.stream()
.filter(t -> t.getDir() == direction)
.map(Word::getCharacters)
.flatMap(List::stream)

View File

@ -1,7 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
@ -25,8 +24,6 @@ public abstract class BoundingBox {
// Also, these are definitely correct and should be used whenever possible.
protected Rectangle2D bBoxPdf;
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
public double getX() {
@ -204,23 +201,22 @@ public abstract class BoundingBox {
}
public double verticalOverlap(BoundingBox other) {
public double verticalOverlapPdf(BoundingBox other) {
return Math.max(0, Math.min(this.getPdfMaxY(), other.getPdfMaxY()) - Math.max(this.getPdfMinY(), other.getPdfMinY()));
}
public static final Comparator<BoundingBox> ILL_DEFINED_ORDER = (o1, o2) -> {
public double verticalOverlap(BoundingBox other) {
if (o1.equals(o2)) {
return 0;
}
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD * ((o1.getHeight() + o2.getHeight()) / 2)) {
return Double.compare(o1.getPdfMinX(), o2.getPdfMinX());
} else {
return Double.compare(o1.getPdfMaxY(), o2.getPdfMaxY());
}
};
return Math.max(0, Math.min(this.getMaxY(), other.getMaxY()) - Math.max(this.getMinY(), other.getMinY()));
}
/**
 * Computes the length of the X interval shared by this box and {@code other}.
 *
 * @param other the box to compare against
 * @return the overlap along the X axis, or 0 when the X ranges do not overlap
 */
public double horizontalOverlap(BoundingBox other) {

    double sharedRight = Math.min(this.getMaxX(), other.getMaxX());
    double sharedLeft = Math.max(this.getMinX(), other.getMinX());
    return Math.max(0, sharedRight - sharedLeft);
}
public double horizontalDistance(BoundingBox other) {
@ -276,4 +272,13 @@ public abstract class BoundingBox {
return this.intersectsX(other) && this.getMinY() >= other.getMaxY();
}
/**
 * Computes the area shared by this box and {@code r2} as the product of
 * {@code horizontalOverlap(r2)} and {@code verticalOverlap(r2)}.
 *
 * @param r2 the box to intersect with
 * @return the product of the horizontal and vertical overlaps
 */
public double intersectedArea(BoundingBox r2) {

    return horizontalOverlap(r2) * verticalOverlap(r2);
}
}

View File

@ -2,7 +2,9 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
import java.awt.geom.Rectangle2D;
import java.util.List;
import java.util.Set;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
@ -36,19 +38,16 @@ public abstract class TextBoundingBox extends BoundingBox {
.map(TextBoundingBox::getBBoxDirAdj)
.collect(RectangleTransformations.collectBBox());
Set<TextDirection> textDirections = components.stream()
Optional<TextDirection> mostCommonDir = components.stream()
.filter(c -> c instanceof TextBoundingBox)
.map(c -> (TextBoundingBox) c)
.map(TextBoundingBox::getDir)
.collect(Collectors.toSet());
.collect(Collectors.groupingBy(Function.identity(), Collectors.counting())).entrySet()
.stream()
.max(Map.Entry.comparingByValue())
.map(Map.Entry::getKey);
if (textDirections.isEmpty()) {
dir = TextDirection.ZERO;
} else if (textDirections.size() > 1) {
throw new IllegalArgumentException("More than one text direction found");
} else {
dir = textDirections.iterator().next();
}
dir = mostCommonDir.orElse(TextDirection.ZERO);
}

View File

@ -9,7 +9,7 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@Service

View File

@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;
@ -12,25 +13,43 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@Service
public class ReadingOrderService {
private static final double THRESHOLD = 5;
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
public static final double SAME_DIRECTION_THRESHOLD = 0.9;
private static final Comparator<TextBoundingBox> COMPARATOR = //
Comparator.comparing(TextBoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(TextBoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
Comparator.comparing(TextBoundingBox::getY,
(o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(TextBoundingBox::getX,
(o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
private static final Comparator<TextBoundingBox> COMPARATOR_DIR_ADJ = //
Comparator.comparing(TextBoundingBox::getYDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
Comparator.comparing(TextBoundingBox::getYDirAdj,
(o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder, boolean useDirAdjCoords) {
/**
 * Collects the text blocks and tables of a page into a single list (text blocks first, then tables)
 * and hands it to the generic {@code resolve} overload with {@code xyReadingOrder} disabled.
 *
 * @param textBlocks text blocks of the page
 * @param tables     tables of the page
 * @return the result of the generic {@code resolve} call on the combined list
 */
public List<AbstractPageBlock> resolve(List<TextPageBlock> textBlocks, List<TablePageBlock> tables) {

    int blockCount = textBlocks.size() + tables.size();
    List<AbstractPageBlock> combinedBlocks = new ArrayList<>(blockCount);
    combinedBlocks.addAll(textBlocks);
    combinedBlocks.addAll(tables);

    boolean xyReadingOrder = false;
    return resolve(combinedBlocks, xyReadingOrder);
}
public <T extends TextBoundingBox> List<T> resolve(List<T> zones, boolean xyReadingOrder) {
boolean useDirAdjCoords = mostSameDirection(zones);
if (zones.isEmpty() || zones.size() == 1) {
return zones;
@ -41,7 +60,7 @@ public class ReadingOrderService {
}
Map<Long, Integer> histogram = new HashMap<>();
for (Zone zone : zones) {
for (TextBoundingBox zone : zones) {
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
long minY = Math.round(bbox.getMinY());
long maxY = Math.round(bbox.getMaxY());
@ -52,8 +71,7 @@ public class ReadingOrderService {
if (histogram.values()
.stream()
.mapToInt(Integer::intValue).average()
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
.mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
return resolveSingleColumnReadingOrder(zones, useDirAdjCoords);
} else {
@ -63,7 +81,7 @@ public class ReadingOrderService {
}
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones, boolean useDirAdjCoords) {
private static <T extends TextBoundingBox> List<T> resolveSingleColumnReadingOrder(List<T> zones, boolean useDirAdjCoords) {
if (useDirAdjCoords) {
return zones.stream()
@ -71,7 +89,7 @@ public class ReadingOrderService {
.stream()
.flatMap(words -> words.stream()
.sorted(COMPARATOR_DIR_ADJ))
.toList();
.collect(Collectors.toList());
}
zones.sort(COMPARATOR);
@ -79,7 +97,7 @@ public class ReadingOrderService {
}
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones, boolean useDirAdjCoords) {
private <T extends TextBoundingBox> List<T> resolveMultiColumnReadingOder(List<T> zones, boolean useDirAdjCoords) {
// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
@ -87,7 +105,7 @@ public class ReadingOrderService {
double minX = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;
for (Zone zone : zones) {
for (T zone : zones) {
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
if (bbox.getX() < minX) {
minX = zone.getXDirAdj();
@ -99,11 +117,11 @@ public class ReadingOrderService {
double midLineXCoordinate = (minX + maxX) / 2;
List<Zone> leftOf = new ArrayList<>();
List<Zone> rightOf = new ArrayList<>();
List<Zone> middle = new ArrayList<>();
List<T> leftOf = new ArrayList<>();
List<T> rightOf = new ArrayList<>();
List<T> middle = new ArrayList<>();
for (Zone zone : zones) {
for (T zone : zones) {
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
if (bbox.getX() < midLineXCoordinate && bbox.getX() + bbox.getWidth() < midLineXCoordinate) {
leftOf.add(zone);
@ -166,14 +184,14 @@ public class ReadingOrderService {
middle.addAll(leftNotIntersecting);
middle.addAll(rightNotIntersecting);
*/
List<Zone> sortedZones = new ArrayList<>();
List<T> sortedZones = new ArrayList<>();
sortedZones.addAll(leftOf);
sortedZones.addAll(rightOf);
ListIterator<Zone> itty = middle.listIterator();
ListIterator<T> itty = middle.listIterator();
while (itty.hasNext()) {
Zone current = itty.next();
T current = itty.next();
Rectangle2D bbox = useDirAdjCoords ? current.getBBoxDirAdj() : current.getBBox();
for (int i = 0; i < sortedZones.size(); i++) {
if (bbox.getY() < sortedZones.get(i).getY()) {
@ -189,4 +207,29 @@ public class ReadingOrderService {
return sortedZones;
}
/**
 * Checks whether a single text direction dominates the given zones.
 * <p>
 * The zones are counted per {@link TextDirection}; the method returns {@code true} as soon as the share of
 * {@code ZERO}, {@code QUARTER_CIRCLE}, {@code HALF_CIRCLE} or {@code THREE_QUARTER_CIRCLE} zones is strictly
 * greater than {@link #SAME_DIRECTION_THRESHOLD}.
 *
 * @param zones the zones to inspect
 * @return {@code true} if one of the four checked directions exceeds the threshold, {@code false} otherwise
 *         (including for an empty list)
 */
private boolean mostSameDirection(List<? extends TextBoundingBox> zones) {

    EnumMap<TextDirection, Integer> countsByDirection = new EnumMap<>(TextDirection.class);
    for (TextBoundingBox zone : zones) {
        countsByDirection.merge(zone.getDir(), 1, Integer::sum);
    }

    int zoneCount = 0;
    for (int count : countsByDirection.values()) {
        zoneCount += count;
    }

    List<TextDirection> checkedDirections = List.of(TextDirection.ZERO,
                                                    TextDirection.QUARTER_CIRCLE,
                                                    TextDirection.HALF_CIRCLE,
                                                    TextDirection.THREE_QUARTER_CIRCLE);
    for (TextDirection direction : checkedDirections) {
        // With no zones this is 0.0 / 0 = NaN, and NaN > threshold is false, so the result is false.
        double share = (double) countsByDirection.getOrDefault(direction, 0) / zoneCount;
        if (share > SAME_DIRECTION_THRESHOLD) {
            return true;
        }
    }
    return false;
}
}

View File

@ -10,7 +10,7 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
package com.knecon.fforesight.service.layoutparser.processor.experimental;
import java.awt.geom.Line2D;
import java.awt.geom.Rectangle2D;
@ -7,7 +7,6 @@ import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import lombok.experimental.UtilityClass;
@ -23,13 +22,13 @@ public class DividingColumnDetectionService {
public List<Rectangle2D> detectColumns(PageContents pageContents) {
if (pageContents.getSortedWords().size() < 2) {
return List.of(pageContents.getCropBox());
if (pageContents.getWords().size() < 2) {
return List.of(pageContents.getPageInformation().cropBox());
}
GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedWords(), pageContents.getCropBox());
GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getWords(), pageContents.getPageInformation().cropBox());
return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getCropBox());
return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getPageInformation().cropBox());
}

View File

@ -1,10 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
package com.knecon.fforesight.service.layoutparser.processor.experimental;
import java.awt.geom.Rectangle2D;
import java.util.LinkedList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.AllArgsConstructor;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
package com.knecon.fforesight.service.layoutparser.processor.experimental;
import java.awt.geom.Rectangle2D;
import java.util.LinkedList;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
package com.knecon.fforesight.service.layoutparser.processor.experimental;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
@ -6,9 +6,6 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.stream.Stream;
import com.iqser.red.commons.jackson.ObjectMapperFactory;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import lombok.AllArgsConstructor;
import lombok.Getter;

View File

@ -1,12 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
package com.knecon.fforesight.service.layoutparser.processor.experimental;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.LinkedList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.experimental.UtilityClass;

View File

@ -1,11 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
package com.knecon.fforesight.service.layoutparser.processor.experimental;
import java.awt.geom.Rectangle2D;
import java.util.LinkedList;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
package com.knecon.fforesight.service.layoutparser.processor.experimental;
import java.awt.geom.Rectangle2D;
import java.util.List;

View File

@ -1,12 +1,15 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -17,18 +20,18 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode(callSuper = true)
public abstract class AbstractPageBlock extends BoundingBox {
public abstract class AbstractPageBlock extends TextBoundingBox {
@JsonIgnore
protected PageBlockType classification;
Set<LayoutEngine> engines = new HashSet<>();
protected Set<LayoutEngine> engines = new HashSet<>();
@JsonIgnore
protected int page;
@JsonIgnore
private Orientation orientation = Orientation.NONE;
protected Orientation orientation = Orientation.NONE;
public abstract String getText();
@ -42,4 +45,6 @@ public abstract class AbstractPageBlock extends BoundingBox {
public abstract boolean isEmpty();
public abstract List<Word> getWords();
}

View File

@ -5,6 +5,8 @@ import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;

View File

@ -1,5 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
@ -11,29 +12,38 @@ import com.knecon.fforesight.service.layoutparser.processor.model.image.Classifi
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
@Data
@RequiredArgsConstructor
public class ClassificationPage {
/**
 * Creates a page from its blocks, its page information and its rulings.
 * <p>
 * The page number is taken from {@code pageInformation}. The page is flagged as landscape when the media box
 * is wider than tall at a rotation of 0 or 180 degrees, or taller than wide at a rotation of 90 or 270 degrees.
 *
 * @param pageBlocks      the blocks stored as {@code textBlocks}
 * @param pageInformation media box, rotation and number of the page
 * @param cleanRulings    the rulings of the page
 */
public ClassificationPage(List<AbstractPageBlock> pageBlocks, PageInformation pageInformation, CleanRulings cleanRulings) {

    this.pageInformation = pageInformation;
    this.pageNumber = pageInformation.number();
    this.textBlocks = pageBlocks;
    this.cleanRulings = cleanRulings;

    var mediaBox = pageInformation.mediabox();
    int rotation = pageInformation.rotationDegrees();

    boolean widerThanTall = mediaBox.getWidth() > mediaBox.getHeight();
    boolean tallerThanWide = mediaBox.getHeight() > mediaBox.getWidth();
    boolean unrotatedOrFlipped = rotation == 0 || rotation == 180;
    boolean quarterTurned = rotation == 90 || rotation == 270;

    this.landscape = widerThanTall && unrotatedOrFlipped || tallerThanWide && quarterTurned;
}
private PageInformation pageInformation;
@NonNull
private List<AbstractPageBlock> textBlocks;
private List<OutlineObject> outlineObjects = new ArrayList<>();
private List<AbstractPageBlock> headlines = new ArrayList<>();
private List<ClassifiedImage> images = new ArrayList<>();
private Rectangle bodyTextFrame;
private boolean landscape;
private int rotation;
private int pageNumber;
@ -42,11 +52,32 @@ public class ClassificationPage {
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
private float pageWidth;
private float pageHeight;
private CleanRulings cleanRulings;
private Map<String, List<Rectangle2D>> markedContentBboxPerType = new HashMap<>();
/**
 * Returns the transform that {@code CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords}
 * computes for this page's {@link PageInformation}. It is recomputed on every call.
 */
public AffineTransform getPdfToPageTransform() {
return CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(getPageInformation());
}
/**
 * Returns the rotation in degrees as reported by the page information.
 */
public int getRotation() {
return pageInformation.rotationDegrees();
}
/**
 * Returns the width reported by the page information, narrowed from double to float.
 */
public float getPageWidth() {
return (float) pageInformation.width();
}
/**
 * Returns the height reported by the page information, narrowed from double to float.
 */
public float getPageHeight() {
return (float) pageInformation.height();
}
}

View File

@ -1,10 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.Rectangle2D;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
import lombok.AllArgsConstructor;
import lombok.Builder;
@ -15,8 +15,9 @@ import lombok.Getter;
@AllArgsConstructor
public class PageContents {
List<Word> sortedWords;
Rectangle2D cropBox;
Rectangle2D mediaBox;
PageInformation pageInformation;
List<Word> words;
List<Ruling> rulings;
List<Box> graphicBBoxes;
}

View File

@ -2,16 +2,63 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
import java.awt.geom.Rectangle2D;
import lombok.AllArgsConstructor;
import lombok.Getter;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
@Getter
@AllArgsConstructor
public class PageInformation {
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
PageContents pageContents;
LineInformation lineInformation;
Rectangle2D mainBodyTextFrame;
GapInformation gapInformation;
public record PageInformation(Rectangle2D mediabox, Rectangle2D cropBox, int number, int rotationDegrees) {
}
/**
 * Builds the page information for a PDFBox page.
 * <p>
 * Media box and crop box are copied into {@link Rectangle2D.Double} instances using their lower-left corner,
 * width and height; the rotation is read from the page.
 *
 * @param pageNum the number to store for the page
 * @param page    the PDFBox page to read boxes and rotation from
 * @return the page information for {@code page}
 */
public static PageInformation fromPDPage(int pageNum, PDPage page) {

    PDRectangle media = page.getMediaBox();
    PDRectangle crop = page.getCropBox();

    Rectangle2D mediaRect = new Rectangle2D.Double(media.getLowerLeftX(), media.getLowerLeftY(), media.getWidth(), media.getHeight());
    Rectangle2D cropRect = new Rectangle2D.Double(crop.getLowerLeftX(), crop.getLowerLeftY(), crop.getWidth(), crop.getHeight());

    return new PageInformation(mediaRect, cropRect, pageNum, page.getRotation());
}
/**
 * Builds the page information for a document {@code Page} node.
 * <p>
 * Media box and crop box are both set to a rectangle at the origin with the page's width and height;
 * number and rotation are taken from the page.
 *
 * @param page the page node to read from
 * @return the page information for {@code page}
 */
public static PageInformation fromPage(Page page) {

    Rectangle2D mediaRect = new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight());
    Rectangle2D cropRect = new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight());

    return new PageInformation(mediaRect, cropRect, page.getNumber(), page.getRotation());
}
/**
 * Returns the height of the media box.
 */
public double height() {
return mediabox.getHeight();
}
/**
 * Returns the page height with the rotation taken into account: for a rotation of 90 or 270 degrees
 * this is {@link #width()}, for every other rotation it is {@link #height()}.
 */
public double heightRot() {

    boolean quarterTurned = rotationDegrees == 90 || rotationDegrees == 270;
    return quarterTurned ? width() : height();
}
/**
 * Returns the width of the media box.
 */
public double width() {
return mediabox.getWidth();
}
/**
 * Returns the x coordinate of the media box ({@code Rectangle2D.getX()}).
 */
public double minX() {
return mediabox.getX();
}
/**
 * Returns the y coordinate of the media box ({@code Rectangle2D.getY()}).
 */
public double minY() {
return mediabox.getY();
}
}

View File

@ -4,6 +4,7 @@ import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -77,7 +78,7 @@ public class SectionIdentifier {
List<Integer> identifiers = new LinkedList<>();
for (int i = 1; i <= 4; i++) {
String numericalIdentifier = numericalIdentifierMatcher.group(i);
if (numericalIdentifier == null || numericalIdentifier.equals("0") || numericalIdentifier.isEmpty() || numericalIdentifier.isBlank()) {
if (numericalIdentifier == null || Objects.equals(numericalIdentifier, "0") || numericalIdentifier.isBlank()) {
break;
}
identifiers.add(Integer.parseInt(numericalIdentifier.trim()));

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
package com.knecon.fforesight.service.layoutparser.processor.model;
import java.util.Collection;
import java.util.LinkedHashMap;

View File

@ -2,12 +2,14 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
@ -28,7 +30,7 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlin
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@ -48,19 +50,22 @@ public class OutlineExtractorService {
@SneakyThrows
public OutlineObjectTree getOutlineObjectTree(PDDocument document) {
public OutlineObjectTree getOutlineObjectTree(File documentFile) {
PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();
try (var document = Loader.loadPDF(documentFile)) {
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
if (documentOutline != null) {
for (PDOutlineItem child : documentOutline.children()) {
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1);
outlineObjectWithChildren.ifPresent(rootNodes::add);
PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
if (documentOutline != null) {
for (PDOutlineItem child : documentOutline.children()) {
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1);
outlineObjectWithChildren.ifPresent(rootNodes::add);
}
}
}
return new OutlineObjectTree(rootNodes);
return new OutlineObjectTree(rootNodes);
}
}
@ -128,9 +133,7 @@ public class OutlineExtractorService {
log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title));
}
return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title,
pageNumber,
transformPointToPageCoords(outlinePosition, userSpaceToPageCoords), depth)));
return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, transformPointToPageCoords(outlinePosition, userSpaceToPageCoords), depth)));
}

View File

@ -10,8 +10,8 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;

View File

@ -4,13 +4,15 @@ import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Collection;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import com.knecon.fforesight.service.ocr.v1.api.model.TableCell;
import lombok.Data;
import lombok.EqualsAndHashCode;
@ -22,7 +24,7 @@ import lombok.NoArgsConstructor;
@NoArgsConstructor
public class Cell extends BoundingBox {
private List<TextPageBlock> textBlocks = new ArrayList<>();
private List<AbstractPageBlock> textBlocks = new ArrayList<>();
private List<Cell> headerCells = new ArrayList<>();
@ -33,17 +35,41 @@ public class Cell extends BoundingBox {
private int pageNumber;
public Cell(Point2D topLeft, Point2D bottomRight) {
public Cell(Point2D topLeft, Point2D bottomRight, AffineTransform pdfToPageTransform) {
this.bBoxPdf = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
this.bBox = bBoxPdf;
this.bBox = RectangleTransformations.transform(bBoxPdf, pdfToPageTransform);
}
public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) {
/**
 * Creates a cell from two corner points given in page coordinates.
 * <p>
 * The rectangle starts at {@code topLeft} and spans the coordinate differences to {@code bottomRight};
 * it is then passed to {@link #fromPageCoordinates(Rectangle2D, AffineTransform)}.
 *
 * @param topLeft            first corner, used as the rectangle origin
 * @param bottomRight        opposite corner
 * @param pageToPdfTransform transform used to derive the PDF-space bounding box
 * @return the new cell
 */
public static Cell fromPageCoordinates(Point2D topLeft, Point2D bottomRight, AffineTransform pageToPdfTransform) {

    double originX = topLeft.getX();
    double originY = topLeft.getY();
    double width = bottomRight.getX() - originX;
    double height = bottomRight.getY() - originY;

    return fromPageCoordinates(new Rectangle2D.Double(originX, originY, width, height), pageToPdfTransform);
}
/**
 * Creates a cell from a rectangle given in page coordinates.
 * <p>
 * {@code r} itself (not a copy) becomes the cell's {@code bBox}; {@code bBoxPdf} is the result of
 * {@code RectangleTransformations.transform(r, pageToPdfTransform)}.
 *
 * @param r                  bounding box in page coordinates
 * @param pageToPdfTransform transform used to derive the PDF-space bounding box
 * @return the new cell
 */
public static Cell fromPageCoordinates(Rectangle2D r, AffineTransform pageToPdfTransform) {

    Cell result = new Cell();
    result.bBox = r;
    result.bBoxPdf = RectangleTransformations.transform(r, pageToPdfTransform);
    return result;
}
/**
 * Creates a cell from an externally detected {@code TableCell}.
 * <p>
 * The PDF-space bounding box is the bounds of the table cell's text region bbox; the page-space bounding box
 * is obtained by applying {@code pdfToPageTransform} to it.
 * NOTE(review): {@code bbox().get()} is called unconditionally; confirm the bbox is always available.
 *
 * @param tableCell          the detected table cell
 * @param pdfToPageTransform transform from PDF coordinates to page coordinates
 */
public Cell(TableCell tableCell, AffineTransform pdfToPageTransform) {
this.bBoxPdf = tableCell.textRegion().region().bbox().get().getBounds2D();
this.bBox = RectangleTransformations.transform(bBoxPdf, pdfToPageTransform);
}
public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform pdfToPageTransform) {
this.bBoxPdf = bBoxInitialUserSpace;
this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D();
this.bBox = RectangleTransformations.transform(bBoxPdf, pdfToPageTransform);
}
@ -56,9 +82,12 @@ public class Cell extends BoundingBox {
}
public void addTextBlock(TextPageBlock textBlock) {
public List<Word> getWords() {
textBlocks.add(textBlock);
return getTextBlocks().stream()
.map(AbstractPageBlock::getWords)
.flatMap(Collection::stream)
.toList();
}
@ -67,24 +96,12 @@ public class Cell extends BoundingBox {
StringBuilder sb = new StringBuilder();
Iterator<TextPageBlock> itty = textBlocks.iterator();
Word previous = null;
while (itty.hasNext()) {
TextPageBlock textBlock = itty.next();
for (Word word : textBlock.getWords()) {
if (previous != null) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');
}
}
sb.append(word.toString());
previous = word;
for (int i = 0; i < textBlocks.size(); i++) {
AbstractPageBlock textBlock = textBlocks.get(i);
sb.append(textBlock);
if (i < textBlocks.size() - 1) {
sb.append("\n");
}
}
return TextNormalizationUtilities.cleanString(sb.toString());

View File

@ -22,6 +22,12 @@ public class CleanRulings {
List<Ruling> verticals; // unmodifiable sorted by X list
/**
 * Creates a {@code CleanRulings} instance from two empty lists, i.e. without horizontal or vertical rulings.
 */
public static CleanRulings empty() {
return new CleanRulings(Collections.emptyList(), Collections.emptyList());
}
public CleanRulings(List<Ruling> horizontals, List<Ruling> verticals) {
this.horizontals = horizontals.stream()

View File

@ -30,15 +30,24 @@ public class Ruling extends Line2D.Float {
OTHER
}
/**
 * Line style stored on a ruling; the {@code Ruling(Point2D, Point2D)} constructor initializes it to {@code SOLID}.
 */
public enum Style {
SOLID,
DASHED
}
@Getter
@Setter
private Classification classification;
@Getter
@Setter
private Style style;
/**
 * Creates a ruling from {@code p1} to {@code p2} with classification {@code OTHER} and style {@code SOLID}.
 *
 * @param p1 start point of the line
 * @param p2 end point of the line
 */
public Ruling(Point2D p1, Point2D p2) {
super(p1, p2);
this.classification = Classification.OTHER;
this.style = Style.SOLID;
}

View File

@ -1,48 +1,48 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Getter
public class TablePageBlock extends AbstractPageBlock {
public static final double CELL_AREA_CONTAINED_THRESHOLD = 0.98;
private final TreeMap<CellPosition, Cell> cellTreeMap = new TreeMap<>();
private final TextPageBlock caption;
private final int rotation;
@Getter
@Setter
private String headline;
private int unrotatedRowCount;
private int unrotatedColCount;
private List<List<Cell>> rows;
@Getter
@Setter
private List<Cell> cells;
private final List<List<Cell>> rows;
public TablePageBlock(List<Cell> cells, int rotation) {
public TablePageBlock(TextPageBlock caption, List<List<Cell>> rows) {
setToBBoxOfComponents(cells);
this.cells = cells;
addCells(cells);
classification = PageBlockType.TABLE;
this.rotation = rotation;
this.classification = PageBlockType.TABLE;
this.caption = caption;
this.rows = rows;
setBBoxes();
}
/**
 * Collects all cells followed by all blocks contained in those cells and passes them, as one list of
 * bounding boxes, to {@code setToBBoxOfComponents}.
 */
private void setBBoxes() {

    Stream<BoundingBox> cellBoxes = getCells().stream()
            .map(cell -> (BoundingBox) cell);
    Stream<BoundingBox> cellContentBoxes = getCells().stream()
            .flatMap(cell -> cell.getTextBlocks().stream())
            .map(block -> (BoundingBox) block);

    // Cells come first, then their contents, matching the order of the combined list.
    setToBBoxOfComponents(Stream.concat(cellBoxes, cellContentBoxes).toList());
}
@ -53,28 +53,19 @@ public class TablePageBlock extends AbstractPageBlock {
}
public List<List<Cell>> getRows() {
if (rows == null) {
rows = computeRows();
// Ignore rows that does not contain any cells and values.
List<List<Cell>> rowsToRemove = new ArrayList<>();
for (List<Cell> row : rows) {
if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) {
rowsToRemove.add(row);
}
}
rows.removeAll(rowsToRemove);
computeHeaders();
}
return rows;
/**
 * Returns the words of all blocks in all cells of this table, in cell order, as an unmodifiable list.
 */
@Override
public List<Word> getWords() {

    return getCells().stream()
            .flatMap(cell -> cell.getTextBlocks().stream())
            .flatMap(block -> block.getWords().stream())
            .toList();
}
public int getRowCount() {
return getRows().size();
@ -85,259 +76,16 @@ public class TablePageBlock extends AbstractPageBlock {
return getRows().stream()
.mapToInt(List::size)
.max()
.orElse(0);
.max().orElse(0);
}
/**
* Detect header cells (either first row or first column):
* Column is marked as header if originalCell text is bold and row originalCell text is not bold.
* Defaults to row.
*/
private void computeHeaders() {
if (rows == null) {
rows = computeRows();
}
// A bold originalCell is a header originalCell as long as every originalCell to the left/top is bold, too
// we move from left to right and top to bottom
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
List<Cell> rowCells = rows.get(rowIndex);
if (rowCells.size() == 1) {
continue;
}
for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
Cell cell = rowCells.get(colIndex);
List<Cell> cellsToTheLeft = rowCells.subList(0, colIndex);
Cell lastHeaderCell = null;
for (Cell leftCell : cellsToTheLeft) {
if (leftCell.isHeaderCell()) {
lastHeaderCell = leftCell;
} else {
break;
}
}
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
List<Cell> cellsToTheTop = new ArrayList<>();
for (int i = 0; i < rowIndex; i++) {
try {
cellsToTheTop.add(rows.get(i)
.get(colIndex));
} catch (IndexOutOfBoundsException e) {
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
}
}
for (Cell topCell : cellsToTheTop) {
if (topCell.isHeaderCell()) {
lastHeaderCell = topCell;
} else {
break;
}
}
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks()
.get(0).getMostPopularWordStyle().equals("bold")) {
cell.setHeaderCell(true);
}
}
}
}
private List<List<Cell>> computeRows() {
List<List<Cell>> rows = new ArrayList<>();
if (rotation == 90) {
for (int i = 0; i < unrotatedColCount; i++) { // rows
List<Cell> lastRow = new ArrayList<>();
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
Cell cell = cellTreeMap.get(new CellPosition(j, i));
if (cell != null) {
lastRow.add(cell);
}
}
rows.add(lastRow);
}
} else if (rotation == 270) {
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < unrotatedRowCount; j++) { // cols
Cell cell = cellTreeMap.get(new CellPosition(j, i));
if (cell != null) {
lastRow.add(cell);
}
}
rows.add(lastRow);
}
} else {
for (int i = 0; i < unrotatedRowCount; i++) {
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < unrotatedColCount; j++) {
Cell cell = cellTreeMap.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
if (cell != null) {
lastRow.add(cell);
}
}
rows.add(lastRow);
}
}
return rows;
}
/**
 * Registers the given cells in the table grid.
 * <p>
 * Cells narrower or lower than 1.1 are removed from the passed list (the list is modified in place).
 * The remaining cells are turned into a row/column matrix by {@code calculateTableStructure} and each
 * matrix entry is stored under its row and column index.
 *
 * @param cells the detected cells; nothing happens if the list is empty
 */
private void addCells(List<Cell> cells) {

    if (cells.isEmpty()) {
        return;
    }

    cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);

    int rowIndex = 0;
    for (List<Cell> matrixRow : calculateTableStructure(cells)) {
        int colIndex = 0;
        for (Cell matrixCell : matrixRow) {
            addCellToRowAndCol(matrixCell, rowIndex, colIndex);
            colIndex++;
        }
        rowIndex++;
    }
}
/**
 * Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
 * <p>
 * The method works in three steps:
 * <ol>
 * <li>All distinct x and y border coordinates of the relevant cells (cells with text, or cells larger than 3 in both
 * dimensions) are collected and sorted; they span a regular grid.</li>
 * <li>For every grid cell of minimum size, the original cell with the largest intersecting area (that also passes the
 * {@code CELL_AREA_CONTAINED_THRESHOLD} check) donates its text blocks. Rows without any text are dropped and the
 * row order is reversed afterwards.</li>
 * <li>Columns in which no cell has any text block are removed from every row.</li>
 * </ol>
 *
 * @param cells The found cells
 * @return TablePageBlock Structure as a rows of cells matrix
 */
private List<List<Cell>> calculateTableStructure(List<Cell> cells) {

    if (cells.isEmpty()) {
        return new ArrayList<>();
    }

    Set<Double> uniqueX = new HashSet<>();
    Set<Double> uniqueY = new HashSet<>();
    cells.stream()
            .filter(c -> !c.getTextBlocks().isEmpty() || (c.getHeight() > 3 && c.getWidth() > 3))
            .forEach(c -> {
                uniqueX.add(c.getPdfMinX());
                uniqueX.add(c.getPdfMaxX());
                uniqueY.add(c.getPdfMinY());
                uniqueY.add(c.getPdfMaxY());
            });

    var sortedUniqueX = uniqueX.stream()
            .sorted()
            .toList();
    var sortedUniqueY = uniqueY.stream()
            .sorted()
            .toList();

    List<List<Cell>> rowsOfCells = new ArrayList<>();

    Double prevY = null;
    for (Double y : sortedUniqueY) {

        List<Cell> row = new ArrayList<>();

        Double prevX = null;
        for (Double x : sortedUniqueX) {

            if (prevY != null && prevX != null) {
                var cellFromGridStructure = new Cell(new Point2D.Double(prevX, prevY), new Point2D.Double(x, y));

                if (cellFromGridStructure.hasMinimumSize()) {

                    // copy the text blocks of the original cell that overlaps this grid cell the most
                    cells.stream()
                            .map(originalCell -> new CellWithIntersection(originalCell,
                                                                          RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxPdf(),
                                                                                                                            originalCell.getBBoxPdf())))
                            .filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
                            .filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
                            .max(Comparator.comparing(CellWithIntersection::intersectedArea))
                            .map(CellWithIntersection::originalCell)
                            .ifPresent(matchingCell -> cellFromGridStructure.getTextBlocks().addAll(matchingCell.getTextBlocks()));

                    row.add(cellFromGridStructure);
                }
            }
            prevX = x;
        }

        // exclude empty rows and rows where all text blocks are empty
        if (prevY != null && prevX != null && !row.isEmpty() && !row.stream()
                .allMatch(cell -> cell.getTextBlocks().isEmpty())) {

            rowsOfCells.add(row);
        }
        prevY = y;
    }

    // rows were built with ascending y; reverse them (presumably so the top-most row comes first - confirm against PDF coordinate handling)
    Collections.reverse(rowsOfCells);

    // now cells are removed which are part of a column without any text blocks
    // this is done by first computing the inverse matrix which contains all columns of cells
    // then the column indices that have to be removed are determined
    List<List<Cell>> columnsOfCells = new ArrayList<>();
    int maxRowLength = rowsOfCells.stream()
            .map(List::size)
            .max(java.util.Comparator.naturalOrder())
            .orElse(0);
    for (int i = 0; i < maxRowLength; i++) {
        columnsOfCells.add(new ArrayList<>());
    }

    for (List<Cell> row : rowsOfCells) {
        for (int j = 0; j < row.size(); j++) {
            columnsOfCells.get(j).add(row.get(j));
        }
    }

    List<Integer> columnIndicesToRemove = new ArrayList<>();
    int columnIndex = 0;
    for (List<Cell> col : columnsOfCells) {
        if (col.stream()
                .allMatch(cell -> cell.getTextBlocks().isEmpty())) {
            columnIndicesToRemove.add(columnIndex);
        }
        columnIndex++;
    }
    // descending order, so removing one index does not shift the indices still to be removed
    columnIndicesToRemove.sort(Collections.reverseOrder());

    // update all rows so that the values of the empty columns get removed
    var rowsOfCellsBefore = new ArrayList<>(rowsOfCells);
    rowsOfCells = new ArrayList<>();
    for (List<Cell> row : rowsOfCellsBefore) {
        var updatedRow = new ArrayList<>(row);
        for (int idxToRemove : columnIndicesToRemove) {
            // remove by position (List.remove(int)), not by equality, and tolerate rows shorter than the widest row
            if (idxToRemove < updatedRow.size()) {
                updatedRow.remove(idxToRemove);
            }
        }
        rowsOfCells.add(updatedRow);
    }

    return rowsOfCells;
}
private void addCellToRowAndCol(Cell cell, int row, int col) {
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
CellPosition cp = new CellPosition(row, col);
cellTreeMap.put(cp, cell);
/**
 * Returns all cells of the table as one flat list, row by row in the order delivered by {@code getRows()}.
 *
 * @return a newly collected list containing the cells of every row
 */
public List<Cell> getCells() {

    var tableRows = getRows();
    return tableRows.stream()
            .flatMap(row -> row.stream())
            .collect(Collectors.toList());
}
@ -360,7 +108,7 @@ public class TablePageBlock extends AbstractPageBlock {
}
if (column != null && column.getTextBlocks() != null) {
boolean first = true;
for (TextPageBlock textBlock : column.getTextBlocks()) {
for (AbstractPageBlock textBlock : column.getTextBlocks()) {
if (!first) {
sb.append("\n");
}
@ -392,7 +140,7 @@ public class TablePageBlock extends AbstractPageBlock {
sb.append(i == 0 ? "\n<th>" : "\n<td>");
if (column != null && column.getTextBlocks() != null) {
boolean first = true;
for (TextPageBlock textBlock : column.getTextBlocks()) {
for (AbstractPageBlock textBlock : column.getTextBlocks()) {
if (!first) {
sb.append("<br />");
}
@ -411,9 +159,4 @@ public class TablePageBlock extends AbstractPageBlock {
return sb.toString();
}
/**
 * Pairs a cell with an intersection area computed for it.
 *
 * @param originalCell     the cell the area was computed for
 * @param intersectedArea  the size of the intersecting area
 */
record CellWithIntersection(Cell originalCell, double intersectedArea) {
}
}

View File

@ -1,9 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
import lombok.Data;

View File

@ -1,9 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.util.List;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.AllArgsConstructor;
import lombok.Data;

View File

@ -7,8 +7,7 @@ import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -65,7 +64,7 @@ public class RedTextPosition extends TextBoundingBox {
pos.setBBoxDirAdj(dirAdjPosition);
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
Rectangle2D bBoxInitialUserSpace = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
Rectangle2D bBoxInitialUserSpace = RectangleTransformations.transform(dirAdjPosition, affineTransform);
pos.setBBoxPdf(bBoxInitialUserSpace); // These are definitely correct

View File

@ -2,47 +2,62 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
@EqualsAndHashCode(callSuper = true)
@Data
@AllArgsConstructor
@Builder
@NoArgsConstructor
public class TextPageBlock extends AbstractPageBlock {
@Builder.Default
@EqualsAndHashCode.Exclude
private List<Word> words = new ArrayList<>();
@Builder.Default
@EqualsAndHashCode.Exclude
private FrequencyCounters frequencyCounters = new FrequencyCounters();
private Rectangle2D bBoxDirAdj;
private boolean underlined;
private PageBlockType classification;
private boolean toDuplicate;
@EqualsAndHashCode.Exclude
private String text;
private boolean changed;
/**
 * Creates a text block with explicit metadata.
 *
 * @param words          the words of this block, handed to {@code setDefaultFields}
 * @param page           the page value stored on this block
 * @param classification the classification stored on this block
 * @param engines        the layout engines stored on this block (the set is stored as passed, not copied)
 * @param orientation    the orientation stored on this block
 */
public TextPageBlock(List<Word> words, int page, PageBlockType classification, Set<LayoutEngine> engines, Orientation orientation) {
this.page = page;
this.classification = classification;
this.engines = engines;
this.orientation = orientation;
// metadata is assigned first; the word-derived fields are initialised last
setDefaultFields(words);
}
/**
 * Creates a text block from the given words only; all word-derived fields are initialised by {@code setDefaultFields}.
 *
 * @param words the words of this block
 */
public TextPageBlock(List<Word> words) {
setDefaultFields(words);
}
private void setDefaultFields(List<Word> words) {
this.words = new ArrayList<>(words);
this.frequencyCounters = new FrequencyCounters();
@ -73,10 +88,6 @@ public class TextPageBlock extends AbstractPageBlock {
this.bBoxDirAdj = new Rectangle2D.Double();
return;
}
this.bBoxDirAdj = words.stream()
.map(Word::getBBoxDirAdj)
.collect(RectangleTransformations.collectBBox());
setToBBoxOfComponents(words);
}
@ -87,7 +98,7 @@ public class TextPageBlock extends AbstractPageBlock {
}
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
public static TextPageBlock merge(Collection<TextPageBlock> textBlocksToMerge) {
if (textBlocksToMerge.isEmpty()) {
throw new IllegalArgumentException("Need to provide at least one TextPageBlock.");
@ -98,14 +109,33 @@ public class TextPageBlock extends AbstractPageBlock {
.count() != 1) {
throw new IllegalArgumentException("Cannot merge textBlocks on different pages.");
}
if (textBlocksToMerge.stream()
.map(AbstractPageBlock::getClassification)
.distinct()
.count() != 1) {
throw new IllegalArgumentException("Cannot merge textBlocks of different types.");
}
if (textBlocksToMerge.stream()
.map(AbstractPageBlock::getDir)
.distinct()
.count() != 1) {
throw new IllegalArgumentException("Cannot merge textBlocks of different directions.");
}
List<Word> sequences = textBlocksToMerge.stream()
.map(TextPageBlock::getWords)
.flatMap(java.util.Collection::stream)
.toList();
sequences = new ArrayList<>(sequences);
.flatMap(Collection::stream)
.collect(Collectors.toList());
return new TextPageBlock(sequences);
TextPageBlock first = textBlocksToMerge.iterator().next();
return new TextPageBlock(sequences,
first.getPage(),
first.getClassification(),
textBlocksToMerge.stream()
.map(AbstractPageBlock::getEngines)
.flatMap(Collection::stream)
.collect(Collectors.toSet()),
Orientation.NONE);
}
@ -172,6 +202,14 @@ public class TextPageBlock extends AbstractPageBlock {
}
/**
 * Removes the given words from this block, flags the block as changed and re-initialises
 * the word-derived fields from the remaining words.
 *
 * @param words the words to remove
 */
public void removeAll(List<Word> words) {
changed = true;
this.words.removeAll(words);
// recompute everything that depends on the word list
setDefaultFields(this.words);
}
public TextPageBlock copy() {
return new TextPageBlock(new ArrayList<>(words));

View File

@ -15,6 +15,7 @@ import org.apache.pdfbox.text.TextPosition;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.AllArgsConstructor;
import lombok.Builder;
@ -66,9 +67,9 @@ public class Word extends TextBoundingBox implements CharSequence {
}
public Word(List<Character> textPositions, int page) {
public Word(List<Character> characters, int page) {
this.characters = new ArrayList<>(textPositions);
this.characters = new ArrayList<>(characters);
this.page = page;
calculateBBoxAndHashcode();
}
@ -101,12 +102,12 @@ public class Word extends TextBoundingBox implements CharSequence {
@Override
public Word subSequence(int start, int end) {
var textPositionSequence = new Word();
textPositionSequence.characters = characters.subList(start, end);
textPositionSequence.page = page;
textPositionSequence.dir = dir;
textPositionSequence.setToBBoxOfComponents(getTextPositions());
return textPositionSequence;
var word = new Word();
word.characters = characters.subList(start, end);
word.page = page;
word.dir = dir;
word.setToBBoxOfComponents(getTextPositions());
return word;
}
@ -262,7 +263,7 @@ public class Word extends TextBoundingBox implements CharSequence {
public void transform(AffineTransform rotateInstance) {
for (RedTextPosition textPosition : getTextPositions()) {
Rectangle2D exactDirAdjCoordinates = rotateInstance.createTransformedShape(textPosition.getBBoxDirAdj()).getBounds2D();
Rectangle2D exactDirAdjCoordinates = RectangleTransformations.transform(textPosition.getBBoxDirAdj(), rotateInstance);
textPosition.setBBoxDirAdj(exactDirAdjCoordinates);
}
calculateBBoxAndHashcode();

View File

@ -13,7 +13,10 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageMetadata;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.ocr.v1.api.model.Figure;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import lombok.RequiredArgsConstructor;
@ -21,48 +24,78 @@ import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor
public class ImageServiceResponseAdapter {
public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse) {
public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse, IdpResult idpResult) {
Map<Integer, List<ClassifiedImage>> images = new HashMap<>();
imageServiceResponse.getData().forEach(imageMetadata -> {
var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification()
.getLabel()
.toUpperCase(Locale.ROOT)) : ImageType.OTHER;
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
imageMetadata.getPosition().getY1(),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber(),imageMetadata.getRepresentation()));
});
// Currently This is a copy but, it will be changed later because i don' t think that we should unclassified images.
imageServiceResponse.getDataCV().forEach(imageMetadata -> {
var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification()
.getLabel()
.toUpperCase(Locale.ROOT)) : ImageType.OTHER;
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
imageMetadata.getPosition().getY1(),
imageMetadata.getGeometry().getWidth(),
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber(),imageMetadata.getRepresentation()));
});
imageServiceResponse.getData()
.forEach(imageMetadata -> addImageMetaData(imageMetadata, images));
imageServiceResponse.getDataCV()
.forEach(imageMetadata -> addImageMetaData(imageMetadata, images));
idpResult.figures()
.forEach(figure -> addFigure(figure, images));
return images;
}
/**
 * Adds a figure as a {@link ClassifiedImage} of type {@code GRAPHIC} to the images of its page.
 * The image is created without alpha and with an empty representation.
 *
 * @param figure the figure to add
 * @param images the per-page image map that receives the new image
 */
private static void addFigure(Figure figure, Map<Integer, List<ClassifiedImage>> images) {

    var figureBounds = figure.image().bbox().get().getBounds2D();
    var pageNumber = figure.image().pageNumber();

    ClassifiedImage classifiedFigure = new ClassifiedImage(figureBounds, ImageType.GRAPHIC, false, pageNumber, "");
    getImagesOnPage(pageNumber, images).add(classifiedFigure);
}
/**
 * Converts the given image metadata into a {@link ClassifiedImage} and adds it to the images of its page.
 *
 * @param imageMetadata the metadata describing position, type, alpha and representation of the image
 * @param images        the per-page image map that receives the new image
 */
private static void addImageMetaData(ImageMetadata imageMetadata, Map<Integer, List<ClassifiedImage>> images) {

    var position = getPosition(imageMetadata);
    var imageType = getImageType(imageMetadata);
    var pageNumber = imageMetadata.getPosition().getPageNumber();

    ClassifiedImage classifiedImage = new ClassifiedImage(position, imageType, imageMetadata.isAlpha(), pageNumber, imageMetadata.getRepresentation());
    getImagesOnPage(pageNumber, images).add(classifiedImage);
}
/**
 * Builds the rectangle of an image from the x1/y1 of its position and the width/height of its geometry.
 *
 * @param imageMetadata the metadata to read position and geometry from
 * @return the image rectangle
 */
private static Rectangle2D.Double getPosition(ImageMetadata imageMetadata) {

    var position = imageMetadata.getPosition();
    var x = position.getX1();
    var y = position.getY1();

    var geometry = imageMetadata.getGeometry();
    return new Rectangle2D.Double(x, y, geometry.getWidth(), geometry.getHeight());
}
/**
 * Determines the image type: the upper-cased classification label if all filters passed, {@code OTHER} otherwise.
 *
 * @param imageMetadata the metadata carrying filters and classification
 * @return the resolved image type
 * @throws IllegalArgumentException if the upper-cased label is not an {@link ImageType} constant
 */
private static ImageType getImageType(ImageMetadata imageMetadata) {

    if (!imageMetadata.getFilters().isAllPassed()) {
        return ImageType.OTHER;
    }
    var label = imageMetadata.getClassification().getLabel();
    return ImageType.valueOf(label.toUpperCase(Locale.ROOT));
}
/**
 * Returns the image list of the given page, creating and registering an empty list if the page has none yet.
 *
 * @param pageNumber the page to look up
 * @param images     the per-page image map
 * @return the (possibly newly created) mutable list stored in the map for that page
 */
private static List<ClassifiedImage> getImagesOnPage(int pageNumber, Map<Integer, List<ClassifiedImage>> images) {

    List<ClassifiedImage> imagesOnPage = images.get(pageNumber);
    if (imagesOnPage == null) {
        imagesOnPage = new ArrayList<>();
        images.put(pageNumber, imagesOnPage);
    }
    return imagesOnPage;
}
public void findOcr(ClassificationPage classificationPage) {
classificationPage.getImages().forEach(image -> {
if (image.getImageType().equals(ImageType.OTHER)) {
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
if (image.getPosition().contains(textblock.getBBoxPdf())) {
image.setImageType(ImageType.OCR);
return;
classificationPage.getImages()
.forEach(image -> {
if (image.getImageType().equals(ImageType.OTHER)) {
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
if (image.getPosition().contains(textblock.getBBoxPdf())) {
image.setImageType(ImageType.OCR);
return;
}
}
}
}
}
});
});
}
}

View File

@ -183,7 +183,7 @@ public class BodyTextFrameService {
if (cell == null || cell.getTextBlocks() == null) {
continue;
}
for (TextPageBlock textBlock : cell.getTextBlocks()) {
for (AbstractPageBlock textBlock : cell.getTextBlocks()) {
expandRectangle(textBlock, page, expansionsRectangle);
}
}
@ -198,7 +198,7 @@ public class BodyTextFrameService {
}
private void expandRectangle(TextPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) {
private void expandRectangle(AbstractPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) {
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
if (textBlock.getPdfMinY() < expansionsRectangle.minX) {

View File

@ -1,25 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.experimental.UtilityClass;
/**
 * Computes the main body text frame of a page from its line bounding boxes.
 */
@UtilityClass
public class MainBodyTextFrameExtractionService {

    // padding factors, relative to the width / height of the unpadded frame
    private static final double TEXT_FRAME_PAD_WIDTH = 0.0;
    private static final double TEXT_FRAME_PAD_HEIGHT = 0.02;


    /**
     * Collects the bounding box of all line boxes and pads it by the configured fractions of its own width and height.
     *
     * @param lineInformation the line information providing the line bounding boxes
     * @return the padded bounding box of all lines
     */
    public Rectangle2D calculateMainBodyTextFrame(LineInformation lineInformation) {

        Rectangle2D unpaddedFrame = lineInformation.getLineBBox().stream()
                .collect(RectangleTransformations.collectBBox());

        double horizontalPadding = unpaddedFrame.getWidth() * TEXT_FRAME_PAD_WIDTH;
        double verticalPadding = unpaddedFrame.getHeight() * TEXT_FRAME_PAD_HEIGHT;

        return RectangleTransformations.pad(unpaddedFrame, horizontalPadding, verticalPadding);
    }

}

View File

@ -1,73 +1,207 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.LinkedList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.stream.Collectors;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.FindGraphicsRaster;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicBBDetector;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.experimental.UtilityClass;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
@UtilityClass
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class PageContentExtractor {
public List<PageContents> getSortedPageContents(String filename) throws IOException {
static boolean USE_IMAGE_BASED_GRAPHIC_DETECTION;
@Getter
int pageCount;
@Getter
File document;
List<PageContents> textPositionSequencesPerPage = new LinkedList<>();
ClassPathResource pdfResource = new ClassPathResource(filename);
PageContents[] pageContents;
CountDownLatch[] finishedLookup;
List<List<Integer>> pageNumberBatches;
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile())) {
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
public PageContentExtractor(File document, int threads) {
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
stripper.setPageNumber(pageNumber);
stripper.setSortByPosition(true);
stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage);
stripper.getText(pdDocument);
Map<Float, List<Word>> sortedTextPositionSequencesPerDir = stripper.getWords()
.stream()
.collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees()));
var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation());
textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
RectangleTransformations.toRectangle2D(pdPage.getMediaBox()),
stripper.getRulings()));
}
this.document = document;
this.pageCount = getPageCount(document);
this.pageContents = new PageContents[pageCount];
this.finishedLookup = new CountDownLatch[pageCount];
for (int i = 0; i < pageCount; i++) {
this.finishedLookup[i] = new CountDownLatch(1);
}
int actualThreads = Math.min(pageCount, threads);
pageNumberBatches = new ArrayList<>(actualThreads);
for (int i = 0; i < actualThreads; i++) {
pageNumberBatches.add(new ArrayList<>(pageCount / actualThreads));
}
for (int i = 1; i <= pageCount; i++) {
pageNumberBatches.get(i % actualThreads).add(i);
}
return textPositionSequencesPerPage;
}
public List<Word> sortByDirAccordingToPageRotation(Map<Float, List<Word>> sortedTextPositionSequencesPerDir, int rotation) {
@SneakyThrows
private int getPageCount(File document) {
LinkedList<Float> sortedKeys = new LinkedList<>(sortedTextPositionSequencesPerDir.keySet().stream().sorted().toList());
for (int i = 0; i < sortedKeys.size(); i++) {
if (sortedKeys.get(i) < rotation) {
Float keyToSwap = sortedKeys.remove(i);
sortedKeys.addLast(keyToSwap);
}
try (var doc = openDocument(document)) {
return doc.getNumberOfPages();
}
return sortedKeys.stream().map(sortedTextPositionSequencesPerDir::get).flatMap(Collection::stream).toList();
}
/**
 * Starts the extraction: one new thread is created and started per page number batch,
 * each running {@code extractPages} on its batch. The method returns without waiting for the threads.
 */
@SneakyThrows
public void startAsync() {

    pageNumberBatches.forEach(batch -> {
        Thread worker = new Thread(() -> extractPages(batch));
        worker.start();
    });
}
/**
 * Extracts the contents of the given pages using a document instance owned by this call.
 * <p>
 * The document is closed and reopened on every 100th processed page to release what PDFBox caches.
 * The page getter is recreated together with the document, so pages are never taken from an
 * already closed document. The document that is open at the end is always closed, also when
 * extraction of a page fails.
 *
 * @param pageNumbers the page numbers to extract, in ascending order
 *                    (the page getter only moves forward)
 */
@SneakyThrows
private void extractPages(List<Integer> pageNumbers) {

    var doc = openDocument(document);
    try {
        var pageGetter = new PageGetter(doc.getPages()
                                                .iterator(), pageCount);
        int count = 0;
        for (Integer pageNumber : pageNumbers) {
            count++;
            if (count % 100 == 0) {
                // As PDFBox caches all types of stuff, we need to close the document every once in a while to save on RAM
                doc.close();
                doc = openDocument(document);
                // the old iterator belongs to the closed document; start over on the fresh one
                pageGetter = new PageGetter(doc.getPages()
                                                    .iterator(), pageCount);
            }

            extractPage(pageNumber, doc, pageGetter.getPage(pageNumber));
        }
    } finally {
        doc.close();
    }
}
/**
 * Loads the given PDF file and marks all security to be removed on the loaded document.
 *
 * @param originFile the PDF file to load
 * @return the opened document; the caller is responsible for closing it
 */
@SneakyThrows
private PDDocument openDocument(File originFile) {

    PDDocument loadedDocument = Loader.loadPDF(originFile);
    loadedDocument.setAllSecurityToBeRemoved(true);
    return loadedDocument;
}
/**
 * Extracts words, rulings and graphic bounding boxes of a single page, stores them as {@link PageContents}
 * at index {@code pageNumber - 1} and counts down the latch of that page.
 *
 * @param pageNumber the 1-based page number
 * @param doc        the open document the page is read from
 * @param pdPage     the page object belonging to {@code pageNumber}
 */
@SneakyThrows
public void extractPage(Integer pageNumber, PDDocument doc, PDPage pdPage) {

    PDFLinesTextStripper textStripper = new PDFLinesTextStripper();
    textStripper.setPageNumber(pageNumber);
    // restrict the stripper to exactly this page
    textStripper.setStartPage(pageNumber);
    textStripper.setEndPage(pageNumber);
    textStripper.setPdpage(pdPage);
    textStripper.getText(doc);

    PageInformation pageInfo = PageInformation.fromPDPage(pageNumber, pdPage);
    List<Word> pageWords = textStripper.getWords();
    List<Ruling> pageRulings = textStripper.getRulings();
    List<Box> pageGraphics = findGraphicBBoxes(pageInfo, pdPage, doc, pageWords);

    int slot = pageNumber - 1;
    pageContents[slot] = new PageContents(pageInfo, pageWords, pageRulings, pageGraphics);
    // publish the result to anyone waiting on this page
    finishedLookup[slot].countDown();
}
/**
 * Finds the bounding boxes of graphics on a page.
 * <p>
 * The boxes found by the {@link GraphicBBDetector} are always returned. If {@code USE_IMAGE_BASED_GRAPHIC_DETECTION}
 * is set, the boxes found by {@link FindGraphicsRaster} are appended, using the word boxes padded by 2 as ignore zones.
 *
 * @param pageInformation information about the page
 * @param pdPage          the page to analyse
 * @param doc             the document the page belongs to
 * @param words           the words of the page
 * @return the detected graphic bounding boxes
 * @throws IOException declared for the detection calls
 */
private static List<Box> findGraphicBBoxes(PageInformation pageInformation, PDPage pdPage, PDDocument doc, List<Word> words) throws IOException {

    List<Box> detectedBoxes = new GraphicBBDetector(pdPage, true).findGraphicBB();
    if (!USE_IMAGE_BASED_GRAPHIC_DETECTION) {
        return detectedBoxes;
    }

    // This should only be used if ocr was performed, it is currently in an early stage and needs to be improved.
    List<Rectangle2D> wordIgnoreZones = new ArrayList<>();
    for (Word word : words) {
        Rectangle2D wordBox = word.getBBoxPdf();
        wordIgnoreZones.add(RectangleTransformations.pad(wordBox, 2, 2));
    }
    detectedBoxes.addAll(FindGraphicsRaster.findCCBoundingBoxes(doc, wordIgnoreZones, pageInformation));
    return detectedBoxes;
}
/**
 * Blocks until the latch of the given page has been counted down and returns the stored contents.
 *
 * @param pageNumber the 1-based page number
 * @return the contents stored for that page
 * @throws InterruptedException if the waiting thread is interrupted
 */
public PageContents awaitPageContents(Integer pageNumber) throws InterruptedException {

    int slot = pageNumber - 1;
    finishedLookup[slot].await();
    return pageContents[slot];
}
/**
 * Blocks until the latches of all pages have been counted down.
 *
 * @return a fixed-size list view of the page contents array, ordered by page
 * @throws InterruptedException if the waiting thread is interrupted
 */
public List<PageContents> awaitAllContents() throws InterruptedException {

    for (int pageIndex = 0; pageIndex < finishedLookup.length; pageIndex++) {
        finishedLookup[pageIndex].await();
    }
    return Arrays.asList(pageContents);
}
/**
 * Convenience entry point: creates an extractor for the document, starts the extraction threads
 * and waits until all pages are available.
 *
 * @param document the PDF file to extract
 * @param threads  the maximum number of extraction threads
 * @return the contents of all pages, ordered by page
 */
@SneakyThrows
public static List<PageContents> getDocumentContents(File document, int threads) {

    PageContentExtractor contentExtractor = new PageContentExtractor(document, threads);
    contentExtractor.startAsync();
    List<PageContents> allContents = contentExtractor.awaitAllContents();
    return allContents;
}
/**
 * Forward-only access to pages by 1-based page number on top of a page iterator.
 * Page numbers must be requested in ascending order; requesting the current position again
 * advances nothing and yields {@code null}.
 */
private static class PageGetter {

    Iterator<PDPage> pageIterator;
    int current;
    int max;


    PageGetter(Iterator<PDPage> pageIterator, int max) {

        this.current = 0;
        this.max = max;
        this.pageIterator = pageIterator;
    }


    /**
     * Advances the iterator to the requested page and returns it.
     *
     * @param pageNumber the 1-based page number, not smaller than the last requested one
     * @return the page at {@code pageNumber}, or {@code null} if the iterator did not have to advance
     */
    public PDPage getPage(int pageNumber) {

        assert pageNumber >= current && pageNumber <= max;

        PDPage lastVisited = null;
        int remainingSteps = pageNumber - current;
        while (remainingSteps > 0) {
            lastVisited = pageIterator.next();
            remainingSteps--;
        }
        current = pageNumber;
        return lastVisited;
    }

}
}

View File

@ -1,24 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import java.awt.geom.Rectangle2D;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import lombok.experimental.UtilityClass;
/**
 * Assembles the {@link PageInformation} of a page from its contents.
 */
@UtilityClass
public class PageInformationService {

    /**
     * Derives line information, main body text frame and gap information from the sorted words of the page
     * and bundles them with the page contents.
     *
     * @param pageContents the contents of the page
     * @return the page information built from the contents
     */
    public PageInformation build(PageContents pageContents) {

        // lines first: the text frame is derived from them
        LineInformation lines = LineDetectionService.calculateLineInformation(pageContents.getSortedWords());
        Rectangle2D textFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(lines);

        // gaps are searched within the text frame
        GapInformation gaps = GapDetectionService.findGapsInLines(pageContents.getSortedWords(), textFrame);

        return new PageInformation(pageContents, lines, textFrame, gaps);
    }

}

View File

@ -5,18 +5,20 @@ import static com.knecon.fforesight.service.layoutparser.processor.utils.Geometr
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.UnionFind;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@ -52,22 +54,22 @@ public class RulingCleaningService {
private Rulings cleanRulings(Rulings rulings) {
List<List<Rectangle2D>> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
.map(RulingCleaningService::getOverlapRectangle)
.distinct()
.toList());
var groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
.map(RulingCleaningService::getOverlapRectangle)
.distinct()
.toList());
List<Ruling> cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream()
.map(rectList -> getXCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList)))
.map(RulingCleaningService::getXCenteredRuling)
.filter(ruling -> ruling.length() > 0)
.toList();
List<List<Rectangle2D>> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
.map(RulingCleaningService::getOverlapRectangle)
.distinct()
.toList());
var groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
.map(RulingCleaningService::getOverlapRectangle)
.distinct()
.toList());
List<Ruling> cleanedHorizontalRulings = groupedOverlappingHorizontalRectangles.stream()
.map(rectList -> getYCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList)))
.map(RulingCleaningService::getYCenteredRuling)
.filter(ruling -> ruling.length() > 0)
.collect(Collectors.toList());
@ -75,13 +77,40 @@ public class RulingCleaningService {
}
private List<List<Rectangle2D>> groupOverlappingRectangles(List<Rectangle2D> rectangles) {
private static Ruling getXCenteredRuling(Set<OverlapRectangle> rectList) {
UnionFind<Rectangle2D> unionFind = new UnionFind<>();
Ruling ruling = getXCenteredRuling(rectList.stream()
.map(OverlapRectangle::rectangle2D)
.collect(RectangleTransformations.collectBBox()));
ruling.setStyle(rectList.iterator().next().style);
return ruling;
}
/**
 * Creates the y-centered ruling for a group of overlapping rectangles: the ruling is computed from the
 * bounding box of all rectangles and receives the style of the first rectangle returned by the set's iterator.
 *
 * @param rectList the group of overlap rectangles; must not be empty
 * @return the ruling representing the group
 */
private static Ruling getYCenteredRuling(Set<OverlapRectangle> rectList) {

    Rectangle2D groupBBox = rectList.stream()
            .map(OverlapRectangle::rectangle2D)
            .collect(RectangleTransformations.collectBBox());

    Ruling centeredRuling = getYCenteredRuling(groupBBox);
    centeredRuling.setStyle(rectList.iterator().next().style());
    return centeredRuling;
}
private Collection<Set<OverlapRectangle>> groupOverlappingRectangles(List<OverlapRectangle> rectangles) {
UnionFind<OverlapRectangle> unionFind = new UnionFind<>(new HashSet<>(rectangles));
for (int i = 0; i < rectangles.size(); i++) {
for (int j = i + 1; j < rectangles.size(); j++) {
Rectangle2D rectangle1 = rectangles.get(i);
Rectangle2D rectangle2 = rectangles.get(j);
OverlapRectangle overlapRectangle1 = rectangles.get(i);
OverlapRectangle overlapRectangle2 = rectangles.get(j);
if (!Objects.equals(overlapRectangle1.style, overlapRectangle2.style)) {
continue;
}
Rectangle2D rectangle1 = overlapRectangle1.rectangle2D;
Rectangle2D rectangle2 = overlapRectangle2.rectangle2D;
// we can stop early when we are too far off because of x-y-sorting
if (rectangle1.getMaxX() < rectangle2.getMinX() && rectangle1.getMaxY() < rectangle2.getMinY()) {
@ -89,21 +118,16 @@ public class RulingCleaningService {
}
if (rectangle1.intersects(rectangle2)) {
unionFind.union(rectangle1, rectangle2);
unionFind.union(overlapRectangle1, overlapRectangle2);
}
}
}
Map<Rectangle2D, List<Rectangle2D>> groups = new HashMap<>();
for (Rectangle2D rectangle : rectangles) {
Rectangle2D root = unionFind.find(rectangle);
groups.computeIfAbsent(root, k -> new ArrayList<>()).add(rectangle);
}
return new ArrayList<>(groups.values());
return unionFind.getGroups();
}
private static Rectangle2D getOverlapRectangle(Ruling ruling) {
private static OverlapRectangle getOverlapRectangle(Ruling ruling) {
float y;
float x;
@ -124,12 +148,14 @@ public class RulingCleaningService {
y = ruling.y2;
h = ruling.y1 - ruling.y2;
}
Rectangle2D overlapRectangle;
if (ruling.isHorizontal()) {
return new Rectangle2D.Double(x - THRESHOLD_X_HORIZONTAL, y - THRESHOLD_Y_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
overlapRectangle = new Rectangle2D.Double(x - THRESHOLD_X_HORIZONTAL, y - THRESHOLD_Y_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
} else {
return new Rectangle2D.Double(x - THRESHOLD_X_VERTICAL, y - THRESHOLD_Y_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
overlapRectangle = new Rectangle2D.Double(x - THRESHOLD_X_VERTICAL, y - THRESHOLD_Y_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
}
return new OverlapRectangle(overlapRectangle, ruling.getStyle());
}
@ -243,4 +269,8 @@ public class RulingCleaningService {
}
/**
 * Pairs the padded overlap rectangle built for a ruling with that ruling's style.
 *
 * @param rectangle2D the overlap rectangle of the ruling
 * @param style the style taken from the ruling
 */
private record OverlapRectangle(Rectangle2D rectangle2D, Ruling.Style style) {
}
}

View File

@ -12,8 +12,8 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
@ -30,7 +30,6 @@ import lombok.extern.slf4j.Slf4j;
@Deprecated
public class SectionsBuilderService {
public void buildSections(ClassificationDocument document) {
List<AbstractPageBlock> chunkWords = new ArrayList<>();
@ -73,8 +72,7 @@ public class SectionsBuilderService {
chunkBlockList.add(chunkBlock);
chunkWords = new ArrayList<>();
if (!chunkBlock.getTables().isEmpty()) {
previousTable = chunkBlock.getTables()
.get(chunkBlock.getTables().size() - 1);
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
}
}
if (current instanceof TablePageBlock table) {
@ -236,12 +234,8 @@ public class SectionsBuilderService {
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
// Allow merging of tables if header row is separated from first logical non-header row
if (previousTableNonHeaderRow.isEmpty()
&& previousTable.getRowCount() == 1
&& previousTable.getRows()
.get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows()
.get(0)
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
previousTableNonHeaderRow = previousTable.getRows().get(0)
.stream()
.map(cell -> {
Cell fakeCell = Cell.copy(cell);
@ -252,8 +246,7 @@ public class SectionsBuilderService {
}
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = currentTable.getRows()
.get(i);
List<Cell> row = currentTable.getRows().get(i);
if (row.size() == tableNonHeaderRow.size() && row.stream()
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
for (int j = 0; j < row.size(); j++) {
@ -272,13 +265,6 @@ public class SectionsBuilderService {
for (AbstractPageBlock container : wordBlockList) {
if (container instanceof TablePageBlock table) {
if (lastHeadline == null || lastHeadline.isEmpty()) {
table.setHeadline("Text in table");
} else {
table.setHeadline("TablePageBlock in: " + lastHeadline);
}
section.getPageBlocks().add(table);
continue;
}
@ -310,8 +296,7 @@ public class SectionsBuilderService {
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
List<Cell> row = table.getRows()
.get(i);
List<Cell> row = table.getRows().get(i);
if (row.size() == 1) {
continue;
}

View File

@ -1,159 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder;
import lombok.SneakyThrows;
@Service
public class TableExtractionService {
private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7;
/**
* Finds tables on a page and moves text blocks into the cells of the found tables.
* Note: This algorithm uses the PDF coordinate system, where the origin {0,0} rotates with the page rotation:
* 0 -> LowerLeft
* 90 -> UpperLeft
* 180 -> UpperRight
* 270 -> LowerRight
* <p>
* DirAdj (Text direction adjusted) values can not be used here.
* <p>
* Side effects: {@code emptyCells} is sorted in place, matching text blocks are added to the cells, and the list
* returned by {@code page.getTextBlocks()} is modified (tables are inserted, text blocks that ended up in the
* cells of an inserted table are removed).
*
* @param emptyCells The cells used to build the table. Sorted in place, so the list must be modifiable.
* @param page Page object that contains textblocks and statistics.
*/
public void extractTables(List<Cell> emptyCells, ClassificationPage page) {
// sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them
emptyCells.sort(CELL_SIZE_COMPARATOR);
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
// NOTE(review): unconditional cast - assumes the page holds only TextPageBlocks when this method is called; confirm with callers
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
for (Cell cell : emptyCells) {
if (cell.hasMinimumSize() && doesCellContainTextBlock(cell, textBlock)) {
cell.addTextBlock(textBlock);
// a text block is assigned to at most one cell: the first match in the size-sorted list
break;
}
}
}
// drop duplicate cells (as defined by Cell's equals/hashCode) and sort the remaining ones before searching for spreadsheet areas
List<Cell> cells = new ArrayList<>(new HashSet<>(emptyCells));
DoubleComparisons.sort(cells, BoundingBox.ILL_DEFINED_ORDER);
List<Rectangle2D> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
// sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
// this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells
spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);
List<TablePageBlock> tables = new ArrayList<>();
for (Rectangle2D area : spreadsheetAreas) {
// collect the still-unclaimed cells of minimum size that lie completely inside this area
List<Cell> containedCells = new ArrayList<>();
for (Cell c : cells) {
if (c.hasMinimumSize() && area.contains(c.getBBoxPdf())) {
containedCells.add(c);
}
}
var containedCellsWithText = containedCells.stream()
.filter(cell -> !cell.getTextBlocks().isEmpty())
.toList();
// only build a table when at least MAX_TABLE_CONTAINED_CELLS_WITH_TEXT cells carry text
// (despite its name the constant is used as a lower bound here) and the cell widths are uniform enough
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
tables.add(new TablePageBlock(containedCells, page.getRotation()));
// claimed cells are removed so that larger enclosing areas processed later cannot reuse them
cells.removeAll(containedCells);
}
}
for (TablePageBlock table : tables) {
// index in the page's block list at which the table will be inserted; -1 means no block of the page lies inside the table
int position = -1;
for (AbstractPageBlock pageBlock : page.getTextBlocks()) {
// NOTE(review): '&&' binds tighter than '?:', so this condition is evaluated as
// instanceof TextPageBlock ? table.contains(pageBlock) : (table.contains(pageBlock) && position == -1)
// i.e. for TextPageBlocks the 'position == -1' guard does not apply and position ends up at the LAST
// contained text block. Confirm whether the first contained block was intended.
if (pageBlock instanceof TextPageBlock ? table.contains(pageBlock) : table.contains(pageBlock) && position == -1) {
position = page.getTextBlocks().indexOf(pageBlock);
}
}
if (position != -1) {
page.getTextBlocks().add(position, table);
var toBeRemoved = table.getCells()
.stream()
.map(Cell::getTextBlocks)
.flatMap(List::stream)
.toList();
// remove text blocks from the page that were also added with the table (from its contained cells)
page.getTextBlocks().removeAll(toBeRemoved);
}
}
}
/**
 * Checks whether the widths of the given cells are uniform enough for the cells to form a table.
 * <p>
 * Every cell width is rounded to the nearest multiple of 10. The cells pass when the number of distinct rounded
 * widths divided by the number of cells does not exceed {@code TABLE_UNIFORMITY_THRESHOLD}.
 * Lists with at most two cells always pass.
 *
 * @param containedCells the cells of one table candidate
 * @return {@code true} if the candidate's cell widths are considered uniform
 */
private boolean checkIfTableCellsAreUniform(List<Cell> containedCells) {

    int cellCount = containedCells.size();
    if (cellCount <= 2) {
        return true;
    }

    long distinctRoundedWidths = containedCells.stream()
            .mapToLong(cell -> Math.round(cell.getWidth() / 10.0) * 10)
            .distinct()
            .count();

    return (double) distinctRoundedWidths / cellCount <= TABLE_UNIFORMITY_THRESHOLD;
}
// Containment test used when assigning text blocks to cells. Delegates to Cell#contains and passes
// RedTextPosition.HEIGHT_PADDING as second argument (presumably a tolerance for the check - confirm in Cell).
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
return cell.contains(textBlock, RedTextPosition.HEIGHT_PADDING);
}
/**
 * Builds table cell candidates from the rectangles spanned by the given ruling lines.
 * <p>
 * Every rectangle reported by {@code RectangularIntersectionFinder.find} is wrapped in a {@link Cell} together with
 * the transform returned by {@code CoordinateTransforms.calculateInitialUserSpaceCoordsToImageCoords(pageInformation, 1)}.
 *
 * @param horizontalRulingLines horizontal rulings of the page
 * @param verticalRulingLines vertical rulings of the page
 * @param pageInformation page information the coordinate transform is calculated from
 * @return a new, modifiable list with one cell per found rectangle
 */
@SneakyThrows
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, PageInformation pageInformation) {

    AffineTransform toImageCoords = CoordinateTransforms.calculateInitialUserSpaceCoordsToImageCoords(pageInformation, 1);
    /*
    switch (pageInformation.rotationDegrees()) {
        case 90 -> affineTransform.translate(RedTextPosition.HEIGHT_PADDING, 0); //although this is wrong, our text coordinates are wrong as well
        case 180 -> affineTransform.translate(0, RedTextPosition.HEIGHT_PADDING);
        case 270 -> affineTransform.translate(-RedTextPosition.HEIGHT_PADDING, 0);
        default -> affineTransform.translate(0, -RedTextPosition.HEIGHT_PADDING);
    }
    */

    List<Cell> foundCells = new ArrayList<>();
    for (var rectangle : RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)) {
        foundCells.add(new Cell(rectangle, toImageCoords));
    }
    return foundCells;
}
}

View File

@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentif
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import lombok.Data;

View File

@ -0,0 +1,42 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
import java.util.Collections;
import java.util.List;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;
/**
 * Single entry point for turning the words of a page into text blocks.
 * Selects one of the three injected blockification services based on the requested {@link LayoutParsingType}.
 */
@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class BlockificationService {
// The three delegates below are made private final by @FieldDefaults and injected through the
// constructor generated by @RequiredArgsConstructor.
RedactManagerBlockificationService redactManagerBlockificationService;
DocstrumBlockificationService docstrumBlockificationService;
DocuMineBlockificationService docuMineBlockificationService;
/**
 * Builds the text blocks of a page with the algorithm that belongs to the given layout parsing type.
 * <ul>
 * <li>{@code REDACT_MANAGER_OLD}: RedactManagerBlockificationService (receives the debug layer)</li>
 * <li>{@code DOCUMINE_OLD}: DocuMineBlockificationService (does not receive the debug layer)</li>
 * <li>{@code DOCUMINE}, {@code REDACT_MANAGER}, {@code REDACT_MANAGER_PARAGRAPH_DEBUG},
 * {@code REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH}: DocstrumBlockificationService with its boolean flag set to {@code true}</li>
 * <li>{@code CLARIFYND}, {@code CLARIFYND_PARAGRAPH_DEBUG}: DocstrumBlockificationService with its boolean flag set to {@code false}</li>
 * </ul>
 *
 * @param layoutParsingType selects the blockification algorithm
 * @param words the words of one page
 * @param cleanRulings the rulings of the page, passed on to the selected service
 * @param layoutDebugLayer debug layer passed on to the services that accept one
 * @return the text blocks produced by the selected service, or an immutable empty list if {@code words} is empty
 */
public List<TextPageBlock> blockify(LayoutParsingType layoutParsingType, List<Word> words, CleanRulings cleanRulings, LayoutDebugLayer layoutDebugLayer) {
// nothing to blockify: return before any service is called
if (words.isEmpty()) {
return Collections.emptyList();
}
// switch expression without default: the compiler enforces that every LayoutParsingType constant is handled
return switch (layoutParsingType) {
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(words, cleanRulings, layoutDebugLayer);
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
docstrumBlockificationService.blockify(words, cleanRulings, true, layoutDebugLayer, layoutParsingType);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words, cleanRulings, false, layoutDebugLayer, layoutParsingType);
};
}
}

View File

@ -10,7 +10,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -30,46 +29,39 @@ public class DocstrumBlockificationService {
static final float THRESHOLD = 1f;
public ClassificationPage blockify(List<Word> textPositions,
CleanRulings rulings,
boolean xyOrder,
LayoutDebugLayer visualizations,
LayoutParsingType layoutParsingType) {
public List<TextPageBlock> blockify(List<Word> words, CleanRulings rulings, boolean xyOrder, LayoutDebugLayer visualizations, LayoutParsingType layoutParsingType) {
CleanRulings usedRulings = rulings.withoutTextRulings();
CleanRulings rulingsWithoutTextRulings = rulings.withoutTextRulings();
List<Zone> zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);
List<Zone> zones = docstrumSegmentationService.segmentPage(words, xyOrder, rulingsWithoutTextRulings);
if (!textPositions.isEmpty()) {
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
visualizations.addLineVisualizationsFromZones(zones, textPositions.get(0).getPage());
visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage());
if (!words.isEmpty() && visualizations != null) {
visualizations.addZoneVisualizations(zones, words.get(0).getPage());
visualizations.addLineVisualizationsFromZones(zones, words.get(0).getPage());
visualizations.addCharactersWithNeighbours(zones, words.get(0).getPage());
}
var pageBlocks = toAbstractPageBlocks(zones);
var classificationPage = new ClassificationPage(pageBlocks);
classificationPage.setCleanRulings(rulings);
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
mergeIntersectingBlocks(pageBlocks, rulingsWithoutTextRulings, 0, 0);
if (layoutParsingType == LayoutParsingType.DOCUMINE
|| layoutParsingType == LayoutParsingType.REDACT_MANAGER
|| layoutParsingType == LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH) {
combineBlocks(classificationPage, layoutParsingType);
combineBlocks(pageBlocks, rulings, layoutParsingType);
}
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
mergeIntersectingBlocks(pageBlocks, rulingsWithoutTextRulings, 0, 0);
}
return classificationPage;
return pageBlocks;
}
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones) {
private List<TextPageBlock> toAbstractPageBlocks(List<Zone> zones) {
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
List<TextPageBlock> abstractPageBlocks = new ArrayList<>();
zones.forEach(zone -> {
List<Word> words = new ArrayList<>();
@ -88,29 +80,23 @@ public class DocstrumBlockificationService {
}
public void combineBlocks(ClassificationPage page, LayoutParsingType layoutParsingType) {
public void combineBlocks(List<TextPageBlock> blocks, CleanRulings rulingsWithoutTextRulings, LayoutParsingType layoutParsingType) {
TextPageBlock previous = new TextPageBlock();
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
CleanRulings usedRulings = page.getCleanRulings().withoutTextRulings();
ListIterator<TextPageBlock> itty = blocks.listIterator();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if (block instanceof TablePageBlock) {
previous = new TextPageBlock();
continue;
}
TextPageBlock current = (TextPageBlock) block;
TextPageBlock current = itty.next();
if (previous != null && !previous.getWords().isEmpty()) {
if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) {
if (current.getDir() != previous.getDir() || rulingsWithoutTextRulings.lineBetween(current, previous)) {
previous = current;
continue;
}
if (current.isHeadline() || previous.isHeadline()) {
if (intersectsYWithPreviousHavingMaxOneLine(previous, current, page)) {
if (intersectsYWithPreviousHavingMaxOneLine(previous, current)) {
previous = combineBlocksAndResetIterator(previous, current, itty, false);
} else {
previous = current;
@ -119,7 +105,7 @@ public class DocstrumBlockificationService {
continue;
}
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, blocks)) {
// previous = combineBlocksAndResetIterator(previous, current, itty, true);
previous = combineBlocksAndResetIterator(previous, current, itty, layoutParsingType != LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
continue;
@ -130,12 +116,12 @@ public class DocstrumBlockificationService {
continue;
}
if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, page)) {
if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, blocks)) {
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
continue;
}
if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) {
if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, blocks)) {
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
continue;
}
@ -144,43 +130,43 @@ public class DocstrumBlockificationService {
previous = current;
}
mergeIntersectingBlocks(page, usedRulings, 0, Y_THRESHOLD);
mergeIntersectingBlocks(blocks, rulingsWithoutTextRulings, 0, Y_THRESHOLD);
}
private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, List<? extends AbstractPageBlock> allBlocks) {
return current.intersectsY(previous) //
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0;
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, allBlocks) <= 0;
}
private boolean isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(TextPageBlock previous,
TextPageBlock current,
ClassificationPage page) {
List<? extends AbstractPageBlock> allBlocks) {
return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
&& !hasBetween(current, previous, allBlocks) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, allBlocks) <= 4;
}
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current) {
return previous.intersectsY(current) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1);
}
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, List<TextPageBlock> allBlocks) {
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
&& previous.intersectsY(current) //
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0;
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, allBlocks) == 0;
}
private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator<AbstractPageBlock> itty, boolean toDuplicate) {
private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator<TextPageBlock> itty, boolean toDuplicate) {
previous.addAll(current.getWords());
previous = buildTextBlock(previous.getWords(), 0);
@ -196,7 +182,7 @@ public class DocstrumBlockificationService {
}
private boolean hasBetween(TextPageBlock block, TextPageBlock other, List<AbstractPageBlock> allBlocks) {
private boolean hasBetween(TextPageBlock block, TextPageBlock other, List<? extends AbstractPageBlock> allBlocks) {
for (AbstractPageBlock current : allBlocks) {
@ -213,7 +199,7 @@ public class DocstrumBlockificationService {
}
private int numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(TextPageBlock block, TextPageBlock other, List<AbstractPageBlock> allBlocks) {
private int numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(TextPageBlock block, TextPageBlock other, List<? extends AbstractPageBlock> allBlocks) {
double minY = Math.min(block.getMinY(), other.getMinY());
double maxY = Math.min(block.getMaxY(), other.getMaxY());
@ -234,25 +220,18 @@ public class DocstrumBlockificationService {
}
public void mergeIntersectingBlocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) {
public void mergeIntersectingBlocks(List<TextPageBlock> blocks, CleanRulings usedRulings, float xThreshold, float yThreshold) {
var blocks = page.getTextBlocks();
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
ListIterator<TextPageBlock> itty = blocks.listIterator();
while (itty.hasNext()) {
AbstractPageBlock block = itty.next();
if (block == null) {
continue;
}
if (block instanceof TablePageBlock) {
TextPageBlock current = itty.next();
if (current == null) {
continue;
}
if (block.getClassification() != null && block.getClassification().isHeadline()) {
if (current.getClassification() != null && current.getClassification().isHeadline()) {
continue;
}
TextPageBlock current = (TextPageBlock) block;
for (int i = 0; i < blocks.size(); i++) {
AbstractPageBlock abstractPageBlock = blocks.get(i);

View File

@ -33,14 +33,14 @@ public class DocuMineBlockificationService {
* This method must use text direction adjusted positions (DirAdj), where {0,0} is on the upper left. Never try to change this!
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
*
* @param textPositions The textPositions of a page.
* @param words The words of a page.
* @param cleanRulings All rulings on a page
* @return The text blocks built from the words of the page.
*/
public ClassificationPage blockify(List<Word> textPositions, CleanRulings cleanRulings) {
public List<TextPageBlock> blockify(List<Word> words, CleanRulings cleanRulings) {
List<Word> chunkWords = new ArrayList<>();
List<AbstractPageBlock> textPageBlocks = new ArrayList<>();
List<TextPageBlock> textPageBlocks = new ArrayList<>();
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
@ -52,7 +52,7 @@ public class DocuMineBlockificationService {
boolean wasSplitted = false;
Double splitX1 = null;
for (Word word : textPositions) {
for (Word word : words) {
boolean lineSeparation = prev != null && word.getYDirAdj() - prev.getMaxYDirAdj() > Math.min(word.getHeight(), prev.getHeight()) * 1.1;
boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight();
@ -120,7 +120,7 @@ public class DocuMineBlockificationService {
textPageBlocks.add(new TextPageBlock(chunkWords));
return new ClassificationPage(textPageBlocks);
return textPageBlocks;
}
@ -171,8 +171,9 @@ public class DocuMineBlockificationService {
continue;
}
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold) && (current.getClassification() == null || current.getClassification()
.equals(inner.getClassification()))) {
if (current.getDir() == inner.getDir() &&//
current.intersects(inner, yThreshold, xThreshold) &&//
(current.getClassification() == null || current.getClassification().equals(inner.getClassification()))) {
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
current.addAll(inner.getWords());

View File

@ -26,24 +26,24 @@ public class RedactManagerBlockificationService {
* This method must use text direction adjusted positions (DirAdj), where {0,0} is on the upper left. Never try to change this!
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
*
* @param textPositions The words of a page.
* @param words The words of a page.
* @param visualizations Debug layer that receives the text block visualizations; may be null, in which case nothing is visualized.
* @return The text blocks built from the words of the page.
*/
public ClassificationPage blockify(List<Word> textPositions, CleanRulings cleanRulings, LayoutDebugLayer visualizations) {
public List<TextPageBlock> blockify(List<Word> words, CleanRulings cleanRulings, LayoutDebugLayer visualizations) {
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
int indexOnPage = 0;
List<Word> chunkWords = new ArrayList<>();
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
List<TextPageBlock> chunkBlockList = new ArrayList<>();
double minX = 1000, maxX = 0, minY = 1000, maxY = 0;
Word prev = null;
boolean wasSplitted = false;
Double splitX1 = null;
for (Word word : textPositions) {
for (Word word : words) {
boolean lineSeparation = word.getYDirAdj() - maxY > word.getHeight() * 1.25;
boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight();
@ -111,7 +111,7 @@ public class RedactManagerBlockificationService {
chunkBlockList.add(cb1);
}
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
Iterator<TextPageBlock> itty = chunkBlockList.iterator();
TextPageBlock previousLeft = null;
TextPageBlock previousRight = null;
@ -159,12 +159,12 @@ public class RedactManagerBlockificationService {
previous = block;
}
if (!textPositions.isEmpty()) {
if (!words.isEmpty() && visualizations != null) {
visualizations.addTextBlockVisualizations(chunkBlockList.stream()
.toList(), textPositions.get(0).getPage());
.toList(), words.get(0).getPage());
}
return new ClassificationPage(chunkBlockList);
return chunkBlockList;
}

View File

@ -19,7 +19,7 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;

View File

@ -5,7 +5,6 @@ import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.toList;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
@ -15,7 +14,6 @@ import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.AbstractSemanticNode;
@ -36,8 +34,8 @@ import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBl
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
@ -112,9 +110,7 @@ public class DocumentGraphFactory {
public void addParagraphOrHeadline(GenericSemanticNode parentNode,
TextPageBlock originalTextBlock,
Context context,
List<TextPageBlock> textBlocksToMerge,
LayoutParsingType layoutParsingType) {
Context context, LayoutParsingType layoutParsingType) {
Page page = context.getPage(originalTextBlock.getPage());
@ -129,17 +125,10 @@ public class DocumentGraphFactory {
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
}
List<TextPageBlock> textBlocks = new ArrayList<>();
textBlocks.add(originalTextBlock);
textBlocks.addAll(textBlocksToMerge);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSort(textBlocks), node, context, page);
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSort(originalTextBlock), node, context, page);
if (node instanceof DuplicatedParagraph duplicatedParagraph) {
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream()
.flatMap(tb -> tb.getWords()
.stream())
.collect(Collectors.toList()), node, context, page);
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(originalTextBlock.getWords(), node, context, page);
duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock);
}

View File

@ -29,19 +29,19 @@ public class SearchTextWithTextPositionFactory {
public static final double LINEBREAK_DELTA_TOLERANCE = 1.5;
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<Word> sequences) {
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<Word> words) {
if (sequences.isEmpty() || sequences.stream()
if (words.isEmpty() || words.stream()
.allMatch(sequence -> sequence.getCharacters().isEmpty())) {
return SearchTextWithTextPositionDto.empty();
}
Context context = new Context();
RedTextPosition currentTextPosition = sequences.get(0).getCharacters().get(0).getTextPosition();
RedTextPosition currentTextPosition = words.get(0).getCharacters().get(0).getTextPosition();
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build();
for (Word word : sequences) {
for (Word word : words) {
for (int i = 0; i < word.getCharacters().size(); ++i) {
currentTextPosition = word.getCharacters().get(i).getTextPosition();
@ -66,7 +66,7 @@ public class SearchTextWithTextPositionFactory {
++context.stringIdx;
}
List<Rectangle2D> positions = sequences.stream()
List<Rectangle2D> positions = words.stream()
.map(Word::getCharacters)
.flatMap(Collection::stream)
.map(Character::getTextPosition)

View File

@ -1,12 +1,12 @@
package com.knecon.fforesight.service.layoutparser.processor.services.factory;
import static java.lang.String.format;
import static java.util.Collections.emptyList;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
@ -17,12 +17,13 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableOfContents;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;
import com.knecon.fforesight.service.layoutparser.processor.services.tables.TableMergingUtility;
import lombok.experimental.UtilityClass;
@ -60,7 +61,7 @@ public class SectionNodeFactory {
section.setTreeId(getTreeId(parentNode, context, section));
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document);
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section);
boolean containsTablesAndTextBlocks = containsTablesAndTextBlocks(pageBlocks);
if (containsTablesAndTextBlocks) {
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
@ -73,8 +74,13 @@ public class SectionNodeFactory {
} else if (type.equals(SectionTreeEntry.Type.SUPER_SECTION)) {
// If a SuperSection contains more blocks than just a headline, we add a Section which contains the remaining textblocks.
addSection(layoutParsingType, section, SectionTreeEntry.Type.SECTION, pageBlocks, emptyList(), context, document);
} else if (!pageBlocks.isEmpty() && pageBlocks.get(0) instanceof TextPageBlock) {
List<TextPageBlock> textPageBlocks = pageBlocks.stream()
.map(block -> (TextPageBlock) block)
.toList();
addParagraphsAndHeadlinesToSection(layoutParsingType, textPageBlocks, context, section);
} else {
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
addTablesToSection(pageBlocks, context, section, document, layoutParsingType);
}
images.stream()
@ -85,6 +91,28 @@ public class SectionNodeFactory {
}
/**
 * Adds every table found in {@code pageBlocks} to the given section.
 * <p>
 * For each table block that has not been consumed yet, {@link TableMergingUtility} is asked for the
 * tables to merge with it (searching only the blocks that are still unconsumed), and the returned
 * tables are added as a single table node. Blocks that are not tables are skipped.
 */
private static void addTablesToSection(List<AbstractPageBlock> pageBlocks,
                                       DocumentGraphFactory.Context context,
                                       AbstractSemanticNode section,
                                       Document document,
                                       LayoutParsingType layoutParsingType) {

    Set<AbstractPageBlock> consumed = new HashSet<>();
    List<AbstractPageBlock> unconsumed = new ArrayList<>(pageBlocks);

    for (AbstractPageBlock candidate : pageBlocks) {
        // only tables which were not already merged into an earlier table start a new table node
        if (!(candidate instanceof TablePageBlock startingTable) || consumed.contains(candidate)) {
            continue;
        }

        List<TablePageBlock> mergeGroup = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(startingTable, unconsumed);
        consumed.addAll(mergeGroup);
        unconsumed.removeAll(mergeGroup);
        TableNodeFactory.addTable(layoutParsingType, section, mergeGroup, context, document);
    }
}
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, AbstractSemanticNode section) {
if (parentNode == null) {
@ -98,54 +126,63 @@ public class SectionNodeFactory {
private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType,
List<AbstractPageBlock> pageBlocks,
DocumentGraphFactory.Context context,
AbstractSemanticNode section,
Document document) {
AbstractSemanticNode section) {
if (pageBlocks.get(0).isHeadline()) {
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, List.of(pageBlocks.get(0)), context, section, document);
addParagraphsAndHeadlinesToSection(layoutParsingType, List.of((TextPageBlock) pageBlocks.get(0)), context, section);
pageBlocks.remove(0);
}
}
private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
List<AbstractPageBlock> pageBlocks,
DocumentGraphFactory.Context context,
AbstractSemanticNode section,
Document document) {
private void addParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
List<TextPageBlock> pageBlocks,
DocumentGraphFactory.Context context,
AbstractSemanticNode section) {
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
for (AbstractPageBlock abstractPageBlock : pageBlocks) {
List<TextPageBlock> mergedPageBlocks = pageBlocks;
if (pageBlocks.size() > 1 && (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) || layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER_OLD))) {
mergedPageBlocks = mergeBlocks(pageBlocks);
}
if (alreadyMerged.contains(abstractPageBlock)) {
continue;
}
for (TextPageBlock textPageBlock : mergedPageBlocks) {
DocumentGraphFactory.addParagraphOrHeadline(section, textPageBlock, context, layoutParsingType);
}
}
remainingBlocks.removeAll(alreadyMerged);
if (abstractPageBlock instanceof TextPageBlock) {
private static List<TextPageBlock> mergeBlocks(List<TextPageBlock> pageBlocks) {
switch (layoutParsingType) {
case REDACT_MANAGER, DOCUMINE, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> {
alreadyMerged.add(abstractPageBlock);
remainingBlocks.remove(abstractPageBlock);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType);
}
default -> {
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
alreadyMerged.addAll(textBlocks);
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks, layoutParsingType);
}
UnionFind<TextPageBlock> blockUnionFind = new UnionFind<>(new HashSet<>(pageBlocks));
for (int i = 0; i < pageBlocks.size(); i++) {
TextPageBlock textPageBlock1 = pageBlocks.get(i);
for (int j = i; j < pageBlocks.size(); j++) {
if (i == j) {
continue;
}
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
alreadyMerged.addAll(tablesToMerge);
TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context, document);
} else {
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
var textPageBlock2 = pageBlocks.get(j);
if (!Objects.equals(textPageBlock2.getPage(), textPageBlock1.getPage())) {
continue;
}
if (!Objects.equals(textPageBlock2.getDir(), textPageBlock1.getDir())) {
continue;
}
if (!Objects.equals(textPageBlock2.getClassification(), textPageBlock1.getClassification())) {
continue;
}
if (!textPageBlock2.intersectsYPdf(textPageBlock1)) {
continue;
}
if (textPageBlock2.isToDuplicate()) {
continue;
}
blockUnionFind.union(textPageBlock2, textPageBlock1);
}
}
return blockUnionFind.getGroups()
.stream()
.map(TextPageBlock::merge)
.toList();
}
@ -222,18 +259,4 @@ public class SectionNodeFactory {
return splitList;
}
private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsY(TextPageBlock atc, List<AbstractPageBlock> pageBlocks) {
return pageBlocks.stream()
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
.filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage())
.filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock)
.filter(abstractTextContainer -> abstractTextContainer.intersectsYPdf(atc))
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
.filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir())
.filter(abstractTextContainer -> !abstractTextContainer.isToDuplicate())
.toList();
}
}

View File

@ -4,7 +4,6 @@ import static java.util.Collections.emptyList;
import java.util.Collection;
import java.util.List;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.GenericSemanticNode;
@ -17,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
@ -50,8 +50,6 @@ public class TableNodeFactory {
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
table.setTreeId(treeId);
addTableCells(layoutParsingType, mergedRows, table, context, document);
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
}
@ -76,16 +74,6 @@ public class TableNodeFactory {
}
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
if (table.streamHeaders()
.findAny().isEmpty()) {
table.streamRow(0)
.forEach(tableCellNode -> tableCellNode.setHeader(true));
}
}
private void addTableCells(LayoutParsingType layoutParsingType, List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context, Document document) {
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
@ -115,32 +103,32 @@ public class TableNodeFactory {
TextBlock textBlock;
if (cell.getTextBlocks().isEmpty()) {
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
} else if (cell.getTextBlocks().size() == 1) {
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getWords(), tableCell, context, page);
} else if (cell.getTextBlocks().size() == 1 && cell.getTextBlocks().get(0) instanceof TextPageBlock textPageBlock) {
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(textPageBlock.getWords(), tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(layoutParsingType,
tableCell,
SectionTreeEntry.Type.SECTION,
cell.getTextBlocks()
.stream()
.map(tb -> (AbstractPageBlock) tb)
.collect(Collectors.toList()),
emptyList(),
context,
document);
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
List<Word> sequences = TextPositionOperations.mergeAndSort(cell.getTextBlocks());
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
} else if (firstTextBlockIsHeadline(cell) || containsTables(cell.getTextBlocks())) {
SectionNodeFactory.addSection(layoutParsingType, tableCell, SectionTreeEntry.Type.SECTION, cell.getTextBlocks(), emptyList(), context, document);
} else if (cellAreaIsSmallerThanThreshold(cell, page)) {
List<Word> words = TextPositionOperations.sort(cell.getWords());
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(words, tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else {
cell.getTextBlocks()
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList(), layoutParsingType));
.stream()
.map(block -> (TextPageBlock) block)
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, layoutParsingType));
}
}
private boolean cellAreaIsSmallerThanPageAreaTimesThreshold(Cell cell, Page page) {
/**
 * Checks whether at least one of the given page blocks is a {@link TablePageBlock}.
 */
private boolean containsTables(List<AbstractPageBlock> pageBlocks) {

    for (AbstractPageBlock candidate : pageBlocks) {
        if (candidate instanceof TablePageBlock) {
            return true;
        }
    }
    return false;
}
/**
 * Checks whether the cell's area is below TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD times the page area (height * width).
 */
private boolean cellAreaIsSmallerThanThreshold(Cell cell, Page page) {

    var largestMergeableArea = TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth();
    return cell.getArea() < largestMergeableArea;
}

View File

@ -18,16 +18,16 @@ public class TextBlockFactory {
long textBlockIdx;
public AtomicTextBlock buildAtomicTextBlock(List<Word> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
public AtomicTextBlock buildAtomicTextBlock(List<Word> words, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page);
return buildAtomicTextBlock(sequences, parent, numberOnPage, page);
return buildAtomicTextBlock(words, parent, numberOnPage, page);
}
public AtomicTextBlock buildAtomicTextBlock(List<Word> sequences, SemanticNode parent, Integer numberOnPage, Page page) {
public AtomicTextBlock buildAtomicTextBlock(List<Word> words, SemanticNode parent, Integer numberOnPage, Page page) {
SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionDto(sequences);
SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionDto(words);
int offset = stringOffset;
stringOffset += searchTextWithTextPositionDto.getSearchText().length();
long idx = textBlockIdx;

View File

@ -11,14 +11,15 @@ import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
@Service
@UtilityClass
public class FindGraphicsRaster {
// Pixels that are lighter than this threshold are ignored
@ -33,7 +34,8 @@ public class FindGraphicsRaster {
var renderer = new PDFRenderer(doc);
var img = renderer.renderImageWithDPI(pageInformation.number() - 1, DPI, ImageType.GRAY);
var imageCtm = CoordinateTransforms.calculateImageCoordsToInitialUserSpaceCoords(pageInformation, CoordinateTransforms.calculateScalingFactor(pageInformation, img.getWidth()));
var imageCtm = CoordinateTransforms.calculateImageCoordsToInitialUserSpaceCoords(pageInformation,
CoordinateTransforms.calculateScalingFactor(pageInformation, img.getWidth()));
return findCCBoundingBoxes(img, remove, THRESHOLD, DPI / 72, imageCtm);
}
@ -47,13 +49,15 @@ public class FindGraphicsRaster {
var w = image.getWidth();
var pixels = new int[w * h];
image.getRaster().getPixels(0, 0, w, h, pixels);
remove.stream().map(rect -> inverseCTM.createTransformedShape(rect).getBounds2D()).forEach(box -> {
for (int y = (int) Math.floor(box.getMinY() / rescale); y <= (int) Math.min(Math.ceil(box.getMaxY() / rescale), h); y++) {
for (int x = (int) Math.floor(box.getMinX() / rescale); x <= (int) Math.min(Math.ceil(box.getMaxX() / rescale), w); x++) {
pixels[w * y + x] = grayScaleTresh;
}
}
});
remove.stream()
.map(rect -> RectangleTransformations.transform(rect, inverseCTM))
.forEach(box -> {
for (int y = (int) Math.floor(box.getMinY() / rescale); y <= (int) Math.min(Math.ceil(box.getMaxY() / rescale), h); y++) {
for (int x = (int) Math.floor(box.getMinX() / rescale); x <= (int) Math.min(Math.ceil(box.getMaxX() / rescale), w); x++) {
pixels[w * y + x] = grayScaleTresh;
}
}
});
// var image2 = createImageFromMatrix(pixels, w, h);
@ -130,8 +134,10 @@ public class FindGraphicsRaster {
}
}
}
return boundingBoxes.stream().filter(box -> box.area() > 0).map(box -> box.transform(imageCTM)).collect(Collectors.toList());
return boundingBoxes.stream()
.filter(box -> box.area() > 0)
.map(box -> box.transform(imageCTM))
.collect(Collectors.toList());
}
}

View File

@ -4,15 +4,14 @@ import java.awt.geom.Rectangle2D;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.stereotype.Service;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
@ -25,32 +24,13 @@ public class GraphicExtractorService {
private static final int MIN_GRAPHICS_AREA = 500;
private final GraphicsClusteringService graphicsClusteringService;
private final FindGraphicsRaster findGraphicsRaster;
@SneakyThrows
public List<Box> extractPathElementGraphics(PDDocument pdDocument,
PDPage pdPage,
int pageNumber,
CleanRulings cleanRulings,
List<Word> words,
boolean graphicsRaster) {
public List<ClassifiedImage> extractPathElementGraphics(List<Box> graphicBBoxes, int pageNumber, CleanRulings cleanRulings) {
List<Box> characterBBoxes = getCharacterBBoxes(words);
List<Box> classifiedRulingsBoxes = getLineBBoxesOfAllClassifiedRulings(cleanRulings);
GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true);
List<Box> graphicBBoxes = graphicBBDetector.findGraphicBB();
if (graphicsRaster) {
// This should only be used if ocr was performed, it is currently in an early stage and needs to be improved.
graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument,
characterBBoxes.stream()
.map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4))
.collect(Collectors.toList()),
PageInformation.fromPDPage(pageNumber, pdPage)));
}
List<Box> filteredGraphicBBoxes = graphicBBoxes.stream()
.filter(box -> !box.intersectsAny(classifiedRulingsBoxes, 4))
.collect(Collectors.toList());
@ -59,19 +39,11 @@ public class GraphicExtractorService {
return clusters.stream()
.filter(box -> box.area() > MIN_GRAPHICS_AREA && box.height() > MIN_GRAPHICS_SIDE_LENGTH && box.width() > MIN_GRAPHICS_SIDE_LENGTH)
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, pageNumber, ""))
.toList();
}
private List<Box> getCharacterBBoxes(List<Word> words) {
return words.stream()
.map(BoundingBox::getBBoxPdf)
.map(Box::new)
.collect(Collectors.toList());
}
private List<Box> getLineBBoxesOfAllClassifiedRulings(CleanRulings cleanRulings) {
return cleanRulings.buildAll()

View File

@ -14,7 +14,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.viewerdoc.model.Outline;
import lombok.SneakyThrows;

View File

@ -199,11 +199,18 @@ public class PDFLinesTextStripper extends PDFTextStripper {
}
private void addVisibleRulings(List<Ruling> path, boolean stroke) throws IOException {
private void addVisibleRulings(List<Ruling> path, boolean stroke) {
try {
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && isBlack(getGraphicsState().getStrokingColor()) || //
!stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) {
// see spec '8.4.3.6 Line dash pattern'
var dashPattern = getGraphicsState().getLineDashPattern();
if (dashPattern != null && dashPattern.getDashArray().length > 0) {
path.forEach(r -> r.setStyle(Ruling.Style.DASHED));
} else {
path.forEach(r -> r.setStyle(Ruling.Style.SOLID));
}
rulings.addAll(path);
}
} catch (UnsupportedOperationException e) {
@ -247,9 +254,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
}
if (!words.isEmpty()) {
previous = words.get(words.size() - 1)
.getCharacters()
.get(words.get(words.size() - 1).getCharacters().size() - 1).getTextPosition();
previous = words.get(words.size() - 1).getCharacters().get(words.get(words.size() - 1).getCharacters().size() - 1).getTextPosition();
}
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) {

View File

@ -0,0 +1,138 @@
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.experimental.UtilityClass;
@UtilityClass
public class AreaSweepGridifier {

    public static final double CELL_AREA_CONTAINED_THRESHOLD = 0.8;
    public static final double MIN_SIZE_FACTOR = 0.5;


    /**
     * Calculates the grid structure of the table. For spanning rows and columns, multiple cells with the same values will be inserted.
     * Works well for perfectly straight tables, but fails as soon as the tables are slightly rotated. Then the area sweep will drop some cells or duplicate them unnecessarily.
     * <p>
     * The grid lines are derived from the x and y edges of the given cells. Every grid cell that has the minimum size
     * receives the text blocks and the header flag of the original cell it overlaps the most, if there is one.
     *
     * @param cells              the detected cells of one table
     * @param pageToPdfTransform transform handed to {@link Cell#fromPageCoordinates} when creating the grid cells
     * @param minCellWidth       column dividers closer together than {@code minCellWidth * MIN_SIZE_FACTOR} are merged
     * @param minCellHeight      row dividers closer together than {@code minCellHeight * MIN_SIZE_FACTOR} are merged
     * @return the table structure as a matrix: rows of cells; an empty list if there are no cells
     */
    public List<List<Cell>> gridify(Set<Cell> cells, AffineTransform pageToPdfTransform, double minCellWidth, double minCellHeight) {

        if (cells.isEmpty()) {
            return new ArrayList<>();
        }

        List<Double> xDividers = getColDividers(cells, minCellWidth);
        List<Double> yDividers = getRowDividers(cells, minCellHeight);

        List<List<Cell>> grid = new ArrayList<>();
        for (int rowIdx = 0; rowIdx + 1 < yDividers.size(); rowIdx++) {
            double top = yDividers.get(rowIdx);
            double bottom = yDividers.get(rowIdx + 1);

            List<Cell> gridRow = new ArrayList<>();
            for (int colIdx = 0; colIdx + 1 < xDividers.size(); colIdx++) {
                double left = xDividers.get(colIdx);
                double right = xDividers.get(colIdx + 1);

                Cell gridCell = Cell.fromPageCoordinates(new Point2D.Double(left, top), new Point2D.Double(right, bottom), pageToPdfTransform);
                if (gridCell.hasMinimumSize()) {
                    findBestOverlappingCell(cells, gridCell).ifPresent(source -> {
                        gridCell.getTextBlocks().addAll(source.getTextBlocks());
                        gridCell.setHeaderCell(source.isHeaderCell());
                    });
                    gridRow.add(gridCell);
                }
            }
            // a row is added even if all of its grid cells were too small
            grid.add(gridRow);
        }
        return grid;
    }


    /**
     * Finds the original cell whose bounding box shares the largest area with the given grid cell.
     */
    private Optional<Cell> findBestOverlappingCell(Set<Cell> cells, Cell gridCell) {

        // NOTE(review): the second condition compares the grid cell's own area against the scaled overlap.
        // If the overlap can never exceed the grid cell's area, this condition is always true - confirm the intent.
        return cells.stream()
                .map(candidate -> new CellWithIntersection(candidate, RectangleTransformations.calculateIntersectedArea(gridCell.getBBox(), candidate.getBBox())))
                .filter(overlap -> overlap.intersectedArea() > 0 && gridCell.getArea() > overlap.intersectedArea() * CELL_AREA_CONTAINED_THRESHOLD)
                .max(Comparator.comparing(CellWithIntersection::intersectedArea))
                .map(CellWithIntersection::originalCell);
    }


    /**
     * Collects the top and bottom edges of all relevant cells and merges those that are too close together.
     */
    private List<Double> getRowDividers(Collection<Cell> cells, double minCellHeight) {

        Set<Double> yEdges = new HashSet<>();
        for (Cell cell : cells) {
            if (definesDividers(cell)) {
                yEdges.add(cell.getMinY());
                yEdges.add(cell.getMaxY());
            }
        }
        return deduplicate(yEdges, minCellHeight * MIN_SIZE_FACTOR);
    }


    /**
     * Collects the left and right edges of all relevant cells and merges those that are too close together.
     */
    private List<Double> getColDividers(Collection<Cell> cells, double minCellWidth) {

        Set<Double> xEdges = new HashSet<>();
        for (Cell cell : cells) {
            if (definesDividers(cell)) {
                xEdges.add(cell.getMinX());
                xEdges.add(cell.getMaxX());
            }
        }
        return deduplicate(xEdges, minCellWidth * MIN_SIZE_FACTOR);
    }


    /**
     * A cell contributes grid lines if it holds text or if both of its sides are longer than 3.
     */
    private boolean definesDividers(Cell cell) {

        return !cell.getTextBlocks().isEmpty() || (cell.getHeight() > 3 && cell.getWidth() > 3);
    }


    /**
     * Finds all values less than {@code minDistance} apart and replaces them with their average.
     *
     * @return the averaged values in ascending order
     */
    private List<Double> deduplicate(Set<Double> values, double minDistance) {

        UnionFind<Double> clusters = new UnionFind<>(values);
        for (Double first : values) {
            for (Double second : values) {
                boolean distinct = !first.equals(second);
                if (distinct && Math.abs(first - second) < minDistance) {
                    clusters.union(first, second);
                }
            }
        }

        return clusters.getGroups()
                .stream()
                .map(group -> group.stream()
                        .mapToDouble(Double::doubleValue)
                        .average()
                        .orElseThrow())
                .sorted()
                .toList();
    }


    /**
     * Pairs an original cell with the area it shares with the grid cell currently being filled.
     */
    record CellWithIntersection(Cell originalCell, double intersectedArea) {

    }

}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR;
@ -14,15 +14,6 @@ public class RectangularIntersectionFinder {
public static List<Rectangle2D> find(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
// // Fix for 211.pdf
// for (Ruling r : horizontalRulingLines) {
// if (r.getX2() < r.getX1()) {
// double a = r.getX2();
// r.x2 = (float) r.getX1();
// r.x1 = (float) a;
// }
// }
List<Rectangle2D> foundRectangles = new ArrayList<>();
Map<Point2D, RulingIntersectionFinder.IntersectingRulings> intersectionPoints = RulingIntersectionFinder.findNaive(horizontalRulingLines, verticalRulingLines);

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import java.awt.geom.Point2D;
import java.util.Collections;
@ -10,6 +10,7 @@ import java.util.Optional;
import java.util.TreeMap;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@ -33,7 +34,7 @@ public class RulingIntersectionFinder {
*/
/*
* The algorithm assumes there are only horizontal and vertical lines which are unique in their coordinates. (E.g. no overlapping horizontal lines exist)
* As a high level overview, the algorithm uses a sweep line advancing from left to right.
* As a high level overview, the algorithm uses a sweep line advancing from left to right.
* It dynamically updates the horizontal rulings which are intersected by the current sweep line.
* When the sweep line hits a vertical line, it then checks for all intersections with the currently intersected horizontal rulings.
* The trick of the algorithm is using a binary search tree to store the currently intersected horizontal rulings. This way the lookup should be in O(log n).

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
@ -12,7 +12,7 @@ public final class RulingTextDirAdjustUtil {
/**
* Converts a ruling (line of a table) the same way TextPositions are converted in PDFBox.
* This will get the y position of the text, adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This will get the y position of the text, adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* <p>
* See org.apache.pdfbox.text.TextPosition
*/

View File

@ -0,0 +1,109 @@
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
@UtilityClass
public class TableAreaFiller {

    // Leftover strips must be larger than this share of the smallest input rectangle's width/height to be kept.
    private static final double MIN_SIZE_TOLERANCE = 0.95;


    /**
     * Finds the parts of a table area that are not covered by any of the given cells and returns them as new cells.
     *
     * @param cells              the cells already found inside the area
     * @param areaPDF            the area to fill; it is transformed with {@code pdfToPageTransform} before the search
     * @param pdfToPageTransform transform applied to {@code areaPDF}; its inverse is handed to {@link Cell#fromPageCoordinates}
     * @return one cell per uncovered rectangle
     */
    public Set<Cell> findMissingCells(List<Cell> cells, Rectangle2D areaPDF, AffineTransform pdfToPageTransform) {

        Rectangle2D pageArea = RectangleTransformations.transform(areaPDF, pdfToPageTransform);
        List<Rectangle2D> occupied = cells.stream()
                .map(BoundingBox::getBBox)
                .toList();

        Set<Rectangle2D> gaps = findMissingRects(occupied, pageArea);
        AffineTransform pageToPdfTransform = getInverse(pdfToPageTransform);

        Set<Cell> missingCells = new HashSet<>();
        for (Rectangle2D gap : gaps) {
            missingCells.add(Cell.fromPageCoordinates(gap, pageToPdfTransform));
        }
        return missingCells;
    }


    /**
     * Subtracts the given rectangles one after another from the area and returns the rectangles that remain.
     * Leftover strips are only kept if they are wider/higher than 95% of the smallest given rectangle's width/height.
     *
     * @param rectangles the rectangles that already occupy parts of the area
     * @param area       the area to be filled
     * @return the uncovered rectangles; just {@code area} if no rectangles are given
     */
    public static Set<Rectangle2D> findMissingRects(List<Rectangle2D> rectangles, Rectangle2D area) {

        double smallestWidth = rectangles.stream()
                .mapToDouble(Rectangle2D::getWidth)
                .min()
                .orElse(0);
        double smallestHeight = rectangles.stream()
                .mapToDouble(Rectangle2D::getHeight)
                .min()
                .orElse(0);
        double minWidth = smallestWidth * MIN_SIZE_TOLERANCE;
        double minHeight = smallestHeight * MIN_SIZE_TOLERANCE;

        Set<Rectangle2D> gaps = new HashSet<>();
        gaps.add(area);
        for (Rectangle2D occupied : rectangles) {
            gaps = fillWithRectangle(gaps, occupied, minWidth, minHeight);
        }
        return gaps;
    }


    /**
     * Removes {@code rectToAdd} from every unfilled rectangle it intersects. Each affected rectangle is replaced by up to
     * four strips (above, below, left and right of {@code rectToAdd}); strips not exceeding the minimum size are dropped.
     */
    private Set<Rectangle2D> fillWithRectangle(Set<Rectangle2D> unfilledRects, Rectangle2D rectToAdd, double minWidth, double minHeight) {

        Set<Rectangle2D> result = new HashSet<>();
        for (Rectangle2D hole : unfilledRects) {
            if (!rectToAdd.intersects(hole)) {
                // untouched by the new rectangle, keep as is
                result.add(hole);
                continue;
            }

            // vertical extent of the left/right strips; shrinks whenever a full-width strip above or below is kept
            double bandTop = hole.getY();
            double bandBottom = hole.getMaxY();

            // strip above the new rectangle, spanning the full width of the hole
            double gapAbove = rectToAdd.getY() - hole.getY();
            if (gapAbove > minHeight) {
                result.add(new Rectangle2D.Double(hole.getX(), hole.getY(), hole.getWidth(), gapAbove));
                bandTop = rectToAdd.getY();
            }

            // strip below the new rectangle, spanning the full width of the hole
            double gapBelow = hole.getMaxY() - rectToAdd.getMaxY();
            if (gapBelow > minHeight) {
                result.add(new Rectangle2D.Double(hole.getX(), rectToAdd.getMaxY(), hole.getWidth(), gapBelow));
                bandBottom = rectToAdd.getMaxY();
            }

            double bandHeight = bandBottom - bandTop;

            // strip to the left of the new rectangle
            double gapLeft = rectToAdd.getX() - hole.getX();
            if (gapLeft > minWidth) {
                result.add(new Rectangle2D.Double(hole.getX(), bandTop, gapLeft, bandHeight));
            }

            // strip to the right of the new rectangle
            double gapRight = hole.getMaxX() - rectToAdd.getMaxX();
            if (gapRight > minWidth) {
                result.add(new Rectangle2D.Double(rectToAdd.getMaxX(), bandTop, gapRight, bandHeight));
            }
        }
        return result;
    }


    /**
     * Inverts the given transform; the checked exception of {@link AffineTransform#createInverse()} is rethrown sneakily.
     */
    @SneakyThrows
    private static AffineTransform getInverse(AffineTransform pdfToPageTransform) {

        return pdfToPageTransform.createInverse();
    }

}

View File

@ -0,0 +1,270 @@
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
import java.awt.Color;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
import com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators;
import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
import com.knecon.fforesight.service.ocr.v1.api.model.Table;
import com.knecon.fforesight.service.ocr.v1.api.model.TableCell;
import com.knecon.fforesight.service.ocr.v1.api.model.TableCellType;
import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Service
@RequiredArgsConstructor
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
public class TableExtractionService {
public static final int MAX_ROWS_OR_COLS = 500;
public static final int MAX_CELLS = MAX_ROWS_OR_COLS * MAX_ROWS_OR_COLS;
BlockificationService blockificationService;
ReadingOrderService readingOrderService;
static int MIN_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
static double TABLE_UNIFORMITY_THRESHOLD = 0.7;
/**
 * Extracts the tables of a page. IDP tables are used when present; otherwise the tables are derived
 * from the given empty cells.
 *
 * @return the tables found on the page
 */
public List<TablePageBlock> extractTables(List<Cell> emptyCells,
                                          List<Word> words,
                                          PageInformation pageInformation,
                                          List<Table> idpTables,
                                          LayoutParsingType layoutParsingType,
                                          LayoutDebugLayer layoutDebugLayer) {

    AffineTransform pdfToPageTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(pageInformation);
    boolean hasIdpTables = idpTables != null && !idpTables.isEmpty();
    if (hasIdpTables) {
        return buildTableFromIdpResult(idpTables, words, pdfToPageTransform, layoutParsingType);
    }
    return extractTables(emptyCells, words, pdfToPageTransform, layoutParsingType, layoutDebugLayer, pageInformation);
}
/**
 * Builds tables from the given cells: finds spreadsheet areas, collects the cells of each area,
 * fills in missing cells, assigns words to the cells and turns valid candidates into tables.
 * Words that end up in an accepted table are removed from {@code words}.
 *
 * @param emptyCells         candidate cells; this list is sorted in place
 * @param words              words of the page; modified when tables are accepted
 * @param pdfToPageTransform transform handed to the created cells and tables
 * @param layoutParsingType  passed on to the blockification
 * @param layoutDebugLayer   receives visualizations of the missing cells (red) and the table area (blue)
 * @param pageInformation    supplies the page number for the visualizations
 * @return the accepted tables, innermost tables first
 */
private List<TablePageBlock> extractTables(List<Cell> emptyCells,
                                           List<Word> words,
                                           AffineTransform pdfToPageTransform,
                                           LayoutParsingType layoutParsingType,
                                           LayoutDebugLayer layoutDebugLayer,
                                           PageInformation pageInformation) {

    // sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them
    // NOTE(review): the HashSet copy below does not keep this order; the effective order is the one set by CELL_SORTER.
    // The sort is kept because it reorders the caller's list, which callers may rely on.
    emptyCells.sort(CELL_SIZE_COMPARATOR);
    List<Cell> cells = new ArrayList<>(new HashSet<>(emptyCells));
    DoubleComparisons.sort(cells, GeometricComparators.CELL_SORTER);

    List<Rectangle2D> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
    // sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
    // this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells
    spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);

    List<TablePageBlock> tables = new ArrayList<>();
    for (Rectangle2D area : spreadsheetAreas) {

        List<Cell> containedCells = new ArrayList<>();
        for (Cell cell : cells) {
            if (cell.hasMinimumSize() && area.contains(cell.getBBoxPdf())) {
                containedCells.add(cell);
            }
        }
        if (containedCells.isEmpty()) {
            continue;
        }

        // if cells are missing, for example a corner hasn't been recognized (See files/syngenta/CustomerFiles/SinglePages/T4_Page16_138 IDD0000261736.pdf),
        // the LinkedCell based gridification can deal with this, but the transpose logic will then drop the entire column.
        // That's why we compute the missing Cells from the spreadsheet area and fill them in.
        Set<Cell> missingCells = TableAreaFiller.findMissingCells(containedCells, area, pdfToPageTransform);
        layoutDebugLayer.addCellVisualizations(missingCells, pageInformation.number(), Color.RED);
        layoutDebugLayer.addCellVisualizations(List.of(new Cell(area, pdfToPageTransform)), pageInformation.number(), Color.BLUE);
        containedCells.addAll(missingCells);

        // Reject oversized tables before assigning text: the per-cell blockification below is the expensive part
        // and its result would be thrown away for such a table anyway.
        if (containedCells.size() > MAX_CELLS) {
            continue;
        }

        Set<Word> wordsInTable = new HashSet<>(); // As docstrum blockfication recomputes the words, we need to remember the origin words to remove them from the overall list of words
        for (Cell cell : containedCells) {
            Function<Point2D, Boolean> contains = p -> cell.getBBoxPdf().contains(p);
            Function<Rectangle2D, Boolean> containsRect = r -> cell.getBBoxPdf().contains(r);
            BlocksWithTheirWords blocksWithTheirWords = sortBlocksIntoCell(layoutParsingType, words, tables, contains, containsRect);
            cell.setTextBlocks(blocksWithTheirWords.blocks());
            wordsInTable.addAll(blocksWithTheirWords.words());
        }

        var containedCellsWithText = containedCells.stream()
                .filter(cell -> !cell.getTextBlocks().isEmpty())
                .toList();

        // only build a table if enough cells contain text and the cell widths are sufficiently uniform
        if (containedCellsWithText.size() >= MIN_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
            TablePageBlock tablePageBlock = new TableFromCellsExtractor(containedCells, pdfToPageTransform).extract();
            cells.removeAll(containedCells);
            addTableIfValid(words, tablePageBlock, tables, wordsInTable);
        }
    }
    return tables;
}
/**
 * Removes every word of the given table from {@code words}.
 */
private static void removeWordsFromCells(List<Word> words, TablePageBlock tablePageBlock) {

    words.removeAll(new HashSet<>(tablePageBlock.getWords()));
}
/**
 * Builds one table per IDP table. Each IDP cell becomes a {@link Cell}, receives the words whose center lies in
 * the cell's text region and is then arranged into a grid. Words of accepted tables are removed from {@code words}.
 * IDP tables that do not have exactly one bounding box are skipped with an error log.
 *
 * @param idpTables          tables reported by IDP; may be null or empty
 * @param words              words of the page; modified when tables are accepted
 * @param pdfToPageTransform transform handed to the created cells
 * @param layoutParsingType  passed on to the blockification
 * @return the accepted tables, or an empty list if there are no IDP tables
 */
private List<TablePageBlock> buildTableFromIdpResult(List<Table> idpTables, List<Word> words, AffineTransform pdfToPageTransform, LayoutParsingType layoutParsingType) {

    if (idpTables == null || idpTables.isEmpty()) {
        return Collections.emptyList();
    }

    List<TablePageBlock> tables = new ArrayList<>();
    for (Table idpTable : idpTables) {

        if (idpTable.bboxes().size() != 1) {
            // Should never happen, as IDP still looks at pages individually. (I think so, at least 😅)
            log.error("IDP Table on multiple pages are not handled yet!");
            continue;
        }

        List<Cell> cells = new ArrayList<>(idpTable.cells().size());
        Set<Word> wordsInTable = new HashSet<>(); // As docstrum blockfication recomputes the words, we need to remember the origin words to remove them from the overall list of words
        for (TableCell idpCell : idpTable.cells()) {
            Cell cell = new Cell(idpCell, pdfToPageTransform);
            // Objects.equals keeps a cell without a kind from failing the whole page with a NullPointerException
            boolean isHeader = Objects.equals(idpCell.kind(), TableCellType.ROW_HEADER) || Objects.equals(idpCell.kind(), TableCellType.COLUMN_HEADER);
            if (isHeader) {
                cell.setHeaderCell(true);
            }
            cells.add(cell);

            Function<Point2D, Boolean> contains = p -> idpCell.textRegion().region().bbox().get().contains(p);
            Function<Rectangle2D, Boolean> containsRect = r -> idpCell.textRegion().region().bbox().get().contains(r);
            BlocksWithTheirWords blocksWithTheirWords = sortBlocksIntoCell(layoutParsingType, words, tables, contains, containsRect);
            cell.setTextBlocks(blocksWithTheirWords.blocks());
            wordsInTable.addAll(blocksWithTheirWords.words());
        }

        TableGridStructureCalculator calculator = new TableGridStructureCalculator(cells, pdfToPageTransform);
        List<List<Cell>> gridCells = calculator.gridify();
        TablePageBlock tablePageBlock = new TablePageBlock(null, gridCells);
        addTableIfValid(words, tablePageBlock, tables, wordsInTable);
    }
    return tables;
}
/**
 * Accepts the table if it has at most MAX_ROWS_OR_COLS rows and between 1 and MAX_ROWS_OR_COLS columns.
 * An accepted table is appended to {@code tables} and its words are removed from {@code words}.
 */
private static void addTableIfValid(List<Word> words, TablePageBlock tablePageBlock, List<TablePageBlock> tables, Set<Word> wordsInTable) {

    boolean withinLimits = tablePageBlock.getRowCount() <= MAX_ROWS_OR_COLS //
            && tablePageBlock.getColCount() != 0 //
            && tablePageBlock.getColCount() <= MAX_ROWS_OR_COLS;
    if (withinLimits) {
        words.removeAll(wordsInTable);
        tables.add(tablePageBlock);
    }
}
/**
 * Collects the content of one cell: the words whose bounding-box center satisfies {@code contains} are
 * blockified, the already built tables satisfying {@code containsRect} are added, and both are brought
 * into reading order.
 *
 * @return the ordered blocks together with the words that were selected for the cell
 */
private BlocksWithTheirWords sortBlocksIntoCell(LayoutParsingType layoutParsingType,
                                                List<Word> words,
                                                List<TablePageBlock> tables,
                                                Function<Point2D, Boolean> contains,
                                                Function<Rectangle2D, Boolean> containsRect) {

    List<Word> wordsInCell = new LinkedList<>();
    for (Word word : words) {
        Rectangle2D wordBox = word.getBBoxPdf();
        Point2D center = new Point2D.Double(wordBox.getCenterX(), wordBox.getCenterY());
        if (contains.apply(center)) {
            wordsInCell.add(word);
        }
    }
    List<TextPageBlock> textBlocks = blockificationService.blockify(layoutParsingType, wordsInCell, CleanRulings.empty(), null);

    List<TablePageBlock> nestedTables = new LinkedList<>();
    for (TablePageBlock candidate : tables) {
        if (!containsRect.apply(candidate.getBBoxPdf())) {
            continue;
        }
        nestedTables.add(candidate);
    }

    return new BlocksWithTheirWords(readingOrderService.resolve(textBlocks, nestedTables), wordsInCell);
}
/**
 * Checks whether the cell widths are uniform enough for a table: widths are rounded to the nearest
 * multiple of 10 and the number of distinct rounded widths relative to the number of cells must not
 * exceed TABLE_UNIFORMITY_THRESHOLD. Up to two cells are always considered uniform.
 */
private boolean checkIfTableCellsAreUniform(List<Cell> containedCells) {

    int cellCount = containedCells.size();
    if (cellCount <= 2) {
        return true;
    }

    long distinctRoundedWidths = containedCells.stream()
            .map(BoundingBox::getWidth)
            .map(size -> Math.round(size / 10.0) * 10)
            .distinct()
            .count();

    return (double) distinctRoundedWidths / cellCount <= TABLE_UNIFORMITY_THRESHOLD;
}
/**
 * Finds cells as the rectangles formed by intersecting horizontal and vertical rulings.
 * Rulings with style DASHED are ignored.
 *
 * @return a list with one cell per rectangle found
 */
@SneakyThrows
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, PageInformation pageInformation) {

    List<Ruling> horizontals = horizontalRulingLines.stream()
            .filter(ruling -> !Objects.equals(Ruling.Style.DASHED, ruling.getStyle()))
            .toList();
    List<Ruling> verticals = verticalRulingLines.stream()
            .filter(ruling -> !Objects.equals(Ruling.Style.DASHED, ruling.getStyle()))
            .toList();

    AffineTransform pdfToPage = CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(pageInformation);

    var rectangles = RectangularIntersectionFinder.find(horizontals, verticals);
    return rectangles.stream()
            .map(rectangle -> new Cell(rectangle, pdfToPage))
            .collect(Collectors.toList());
}
/**
 * Content assigned to a single cell: the blocks placed in the cell and the words that were selected for it.
 */
private record BlocksWithTheirWords(List<AbstractPageBlock> blocks, Collection<Word> words) {
}
}

View File

@ -0,0 +1,133 @@
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import java.awt.geom.AffineTransform;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
/**
 * Turns a collection of cells into a {@link TablePageBlock}: the cells are arranged into a grid and
 * header cells are detected.
 */
@Slf4j
public class TableFromCellsExtractor {

    @JsonIgnore
    protected PageBlockType classification;
    // never null, so that extracting from an empty cell list yields an empty table instead of failing
    private List<List<Cell>> rows = new ArrayList<>();
    @Getter
    @Setter
    private final List<Cell> originCells;
    private final AffineTransform pdfToPageTransform;


    public TableFromCellsExtractor(List<Cell> originCells, AffineTransform pdfToPageTransform) {

        classification = PageBlockType.TABLE;
        this.originCells = originCells;
        this.pdfToPageTransform = pdfToPageTransform;
    }


    /**
     * Computes the grid and the header cells and wraps the result in a table block.
     *
     * @return the table built from the origin cells; it has no rows if there are no origin cells
     */
    public TablePageBlock extract() {

        computeRows(originCells);
        computeHeaders();

        return new TablePageBlock(null, rows);
    }


    /**
     * Detects header cells and links every cell to its headers.
     * A cell becomes a header cell if its first block is a text block whose most popular word style is "bold".
     * Each cell is linked to the last header cell of the unbroken run of header cells starting at the left end
     * of its row, and likewise to the last one of the run starting at the top of its column.
     * Rows with a single cell are skipped. If no header cell is found at all, the first row becomes the header.
     */
    private void computeHeaders() {

        // we move from left to right and top to bottom, so the header flags of all cells to the left/top are already final
        for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
            List<Cell> rowCells = rows.get(rowIndex);
            if (rowCells.size() == 1) {
                continue;
            }

            for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
                Cell cell = rowCells.get(colIndex);

                Cell leftHeaderCell = lastLeadingHeaderCell(rowCells.subList(0, colIndex));
                if (leftHeaderCell != null) {
                    cell.getHeaderCells().add(leftHeaderCell);
                }

                List<Cell> cellsToTheTop = new ArrayList<>();
                for (int i = 0; i < rowIndex; i++) {
                    List<Cell> rowAbove = rows.get(i);
                    if (colIndex < rowAbove.size()) {
                        cellsToTheTop.add(rowAbove.get(colIndex));
                    } else {
                        log.debug("No cell {} in row {}, ignoring.", colIndex, i);
                    }
                }

                // tracked separately from the left header, otherwise the left header would be added a second time
                // whenever the column has no header cell at its top
                Cell topHeaderCell = lastLeadingHeaderCell(cellsToTheTop);
                if (topHeaderCell != null) {
                    cell.getHeaderCells().add(topHeaderCell);
                }

                if (!cell.getTextBlocks().isEmpty() //
                        && cell.getTextBlocks().get(0) instanceof TextPageBlock textPageBlock //
                        && "bold".equals(textPageBlock.getMostPopularWordStyle())) {
                    cell.setHeaderCell(true);
                }
            }
        }

        setFirstRowAsHeaderIfNoneFound(rows);
    }


    /**
     * Returns the last cell of the unbroken run of header cells at the start of the list,
     * or null if the first cell is not a header cell or the list is empty.
     */
    private static Cell lastLeadingHeaderCell(List<Cell> cells) {

        Cell lastHeaderCell = null;
        for (Cell candidate : cells) {
            if (!candidate.isHeaderCell()) {
                break;
            }
            lastHeaderCell = candidate;
        }
        return lastHeaderCell;
    }


    /**
     * Marks all cells of the first row as header cells if the table has no header cell yet.
     */
    private void setFirstRowAsHeaderIfNoneFound(List<List<Cell>> rows) {

        if (rows.isEmpty()) {
            return;
        }

        if (rows.stream()
                .flatMap(Collection::stream)
                .noneMatch(Cell::isHeaderCell)) {
            rows.get(0)
                    .forEach(cell -> cell.setHeaderCell(true));
        }
    }


    /**
     * Arranges the cells into a grid of rows; an empty cell list results in no rows.
     */
    private void computeRows(List<Cell> cells) {

        if (cells.isEmpty()) {
            rows = new ArrayList<>();
            return;
        }

        TableGridStructureCalculator calculator = new TableGridStructureCalculator(cells, pdfToPageTransform);
        rows = calculator.gridify();
    }

}

View File

@ -0,0 +1,353 @@
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class TableGridStructureCalculator {
// multiplied with minimum cell height/width, Cells may be at most this apart in one dimension, and must overlap at least that much in the other dimension to be considered neighbours
private static final double DISTANCE_FACTOR = 0.5;
Set<Cell> cells;
AffineTransform pageToPdfTransform;
double minCellHeight;
double minCellWidth;
@SneakyThrows
TableGridStructureCalculator(Collection<Cell> cells, AffineTransform pdfToPageTransform) {
this.cells = new HashSet<>(cells);
this.pageToPdfTransform = pdfToPageTransform.createInverse();
this.minCellHeight = cells.stream()
.mapToDouble(cell -> cell.getBBox().getHeight())
.min().orElse(0);
this.minCellWidth = cells.stream()
.mapToDouble(cell -> cell.getBBox().getWidth())
.min().orElse(0);
}
/**
 * Arranges the cells into a matrix of rows.
 * If cells overlap significantly, the area sweep implementation is used instead of the neighbour based one.
 * Otherwise every cell with more than one neighbour in any direction is split along its neighbours, and this is
 * repeated until no cell needs a split anymore. The rows are then read from the resulting neighbour links.
 * Rows and columns without any text are removed in both cases.
 *
 * @return the table structure as a list of rows of cells
 */
public List<List<Cell>> gridify() {

    if (cellsHaveLargeOverlaps()) {
        // If cells overlap significantly, the logic below will keep splitting them infinitely, so we revert to the simpler area sweep implementation.
        List<List<Cell>> sweptRows = AreaSweepGridifier.gridify(cells, pageToPdfTransform, minCellWidth, minCellHeight);
        return removeEmptyCols(removeEmptyRows(sweptRows));
    }

    List<LinkedCell> current = new ArrayList<>();
    for (Cell cell : cells) {
        current.add(new LinkedCell(cell));
    }
    computeNeighbours(current);

    boolean splitPending = current.stream()
            .anyMatch(LinkedCell::needsSplit);
    while (splitPending) {
        List<LinkedCell> next = new LinkedList<>();
        for (LinkedCell linkedCell : current) {
            next.addAll(linkedCell.needsSplit() ? linkedCell.split() : List.of(linkedCell));
        }
        computeNeighbours(next);
        current = next;
        splitPending = current.stream()
                .anyMatch(LinkedCell::needsSplit);
    }

    return buildStructure(current);
}
/**
 * Checks whether any two different cells overlap by more than DISTANCE_FACTOR times the minimum
 * cell width horizontally and the minimum cell height vertically.
 */
private boolean cellsHaveLargeOverlaps() {

    double widthLimit = minCellWidth * DISTANCE_FACTOR;
    double heightLimit = minCellHeight * DISTANCE_FACTOR;

    for (Cell first : cells) {
        for (Cell second : cells) {
            boolean overlapping = !first.equals(second) //
                    && first.horizontalOverlap(second) > widthLimit //
                    && first.verticalOverlap(second) > heightLimit;
            if (overlapping) {
                return true;
            }
        }
    }
    return false;
}
/**
 * Builds the rows from the linked cells and drops rows and columns without text.
 *
 * @throws AssertionError if the rows do not all have the same length
 */
private List<List<Cell>> buildStructure(List<LinkedCell> cells) {

    if (cells.isEmpty()) {
        return Collections.emptyList();
    }

    List<List<Cell>> grid = buildRows(cells);
    if (isNotRectangular(grid)) {
        throw new AssertionError();
    }
    return removeEmptyCols(removeEmptyRows(grid));
}
/**
 * Returns true if there are no rows or if any row differs in length from the first row.
 */
private boolean isNotRectangular(List<List<Cell>> rows) {

    if (rows.isEmpty()) {
        return true;
    }

    int expectedSize = rows.get(0).size();
    for (List<Cell> row : rows) {
        if (row.size() != expectedSize) {
            return true;
        }
    }
    return false;
}
/**
 * Reads the rows from the neighbour links: starts at the cell without left and above neighbours and
 * follows the first below neighbour from row to row.
 *
 * @throws AssertionError if the rows do not all have the same length
 */
private List<List<Cell>> buildRows(List<LinkedCell> cells) {

    List<LinkedCell> topLeftCandidates = new ArrayList<>();
    for (LinkedCell candidate : cells) {
        if (candidate.isTopLeft()) {
            topLeftCandidates.add(candidate);
        }
    }
    assert topLeftCandidates.size() == 1;

    List<List<Cell>> rows = new ArrayList<>();
    LinkedCell rowStart = topLeftCandidates.get(0);
    while (true) {
        rows.add(buildRow(rowStart));
        if (rowStart.belows.isEmpty()) {
            break;
        }
        rowStart = rowStart.belows.get(0);
    }

    if (isNotRectangular(rows)) {
        throw new AssertionError();
    }
    return rows;
}
/**
 * Collects the cells of one row by following the first right neighbour, starting at the given cell.
 */
private static List<Cell> buildRow(LinkedCell cell) {

    List<Cell> row = new ArrayList<>();
    for (LinkedCell cursor = cell; ; cursor = cursor.rights.get(0)) {
        row.add(cursor.originalCell);
        if (cursor.rights.isEmpty()) {
            return row;
        }
    }
}
/**
 * Recomputes the neighbour links of every cell in the list.
 */
private void computeNeighbours(List<LinkedCell> cells) {

    cells.forEach(linkedCell -> {
        linkedCell.resetNeighbours();
        computeNeighbours(linkedCell, cells);
    });
}
/**
 * Links the cell to its neighbours among the other cells.
 * A cell is a horizontal neighbour if it is at most DISTANCE_FACTOR times the minimum cell width away horizontally
 * and overlaps vertically by at least DISTANCE_FACTOR times the minimum cell height. Vertical neighbours are found
 * the same way with the dimensions swapped, and only if the cell is not already a horizontal neighbour.
 * The side is decided by comparing the centers of the bounding boxes.
 */
private void computeNeighbours(LinkedCell cell, List<LinkedCell> otherCells) {

    double widthTolerance = minCellWidth * DISTANCE_FACTOR;
    double heightTolerance = minCellHeight * DISTANCE_FACTOR;
    Cell self = cell.originalCell;

    for (LinkedCell candidate : otherCells) {
        if (cell.equals(candidate)) {
            continue;
        }
        Cell other = candidate.originalCell;

        boolean sideBySide = self.horizontalDistance(other) <= widthTolerance && self.verticalOverlap(other) >= heightTolerance;
        if (sideBySide) {
            boolean candidateIsRight = self.getBBox().getCenterX() <= other.getBBox().getCenterX();
            (candidateIsRight ? cell.rights : cell.lefts).add(candidate);
            continue;
        }

        boolean stacked = self.verticalDistance(other) <= heightTolerance && self.horizontalOverlap(other) >= widthTolerance;
        if (stacked) {
            boolean candidateIsBelow = self.getBBox().getCenterY() <= other.getBBox().getCenterY();
            (candidateIsBelow ? cell.belows : cell.aboves).add(candidate);
        }
    }
}
/**
 * Transposes a rectangular table, turning rows into columns.
 * The number of columns is taken from the first row; an empty table yields an empty result.
 *
 * @param table the rows of the table; must not be null
 * @return a new list of columns, each a new list
 * @throws IndexOutOfBoundsException if a row is shorter than the first row
 */
static <T> List<List<T>> transpose(List<List<T>> table) {

    if (table.isEmpty()) {
        // nothing to read the column count from
        return new ArrayList<>();
    }

    final int columnCount = table.get(0).size();
    List<List<T>> columns = new ArrayList<>(columnCount);
    for (int i = 0; i < columnCount; i++) {
        List<T> column = new ArrayList<>(table.size());
        for (List<T> row : table) {
            column.add(row.get(i));
        }
        columns.add(column);
    }
    return columns;
}
/**
 * Removes the columns in which no cell has text, by transposing, removing empty rows and transposing back.
 */
private List<List<Cell>> removeEmptyCols(List<List<Cell>> rowsOfCells) {

    if (rowsOfCells.isEmpty()) {
        return rowsOfCells;
    }
    return transpose(removeEmptyRows(transpose(rowsOfCells)));
}
/**
 * Keeps only the rows in which at least one cell has text blocks.
 */
private List<List<Cell>> removeEmptyRows(List<List<Cell>> rowsOfCells) {

    List<List<Cell>> rowsWithText = new ArrayList<>();
    for (List<Cell> row : rowsOfCells) {
        boolean hasText = false;
        for (Cell cell : row) {
            if (!cell.getTextBlocks().isEmpty()) {
                hasText = true;
                break;
            }
        }
        if (hasText) {
            rowsWithText.add(row);
        }
    }
    return rowsWithText;
}
/**
 * A cell together with links to its neighbouring cells in all four directions.
 */
class LinkedCell {

    private final Cell originalCell;
    private final List<LinkedCell> rights = new LinkedList<>();
    private final List<LinkedCell> lefts = new LinkedList<>();
    private final List<LinkedCell> aboves = new LinkedList<>();
    private final List<LinkedCell> belows = new LinkedList<>();


    LinkedCell(Cell cell) {

        this.originalCell = cell;
    }


    /**
     * A cell needs a split as soon as it has more than one neighbour in any direction.
     */
    public boolean needsSplit() {

        for (List<LinkedCell> side : List.of(rights, lefts, aboves, belows)) {
            if (side.size() > 1) {
                return true;
            }
        }
        return false;
    }


    public boolean isTopLeft() {

        return lefts.isEmpty() && aboves.isEmpty();
    }


    public String toString() {

        return originalCell.toString();
    }


    /**
     * Splits the cell along the neighbours of one side. Horizontal neighbours take precedence over vertical ones,
     * and within a dimension the side with more neighbours wins (rights/aboves on a tie).
     *
     * @return the resulting cells, or just this cell if no side has more than one neighbour
     */
    public Collection<LinkedCell> split() {

        int rightCount = rights.size();
        int leftCount = lefts.size();
        int aboveCount = aboves.size();
        int belowCount = belows.size();

        if (rightCount > 1 && rightCount >= leftCount) {
            return splitY(rights);
        } else if (leftCount > 1) {
            return splitY(lefts);
        } else if (aboveCount > 1 && aboveCount >= belowCount) {
            return splitX(aboves);
        } else if (belowCount > 1) {
            return splitX(belows);
        }
        return List.of(this);
    }


    /**
     * Cuts the cell horizontally at the lower edge (max y) of every neighbour, in ascending order.
     * Cut positions are capped at the lower edge of this cell.
     */
    private List<LinkedCell> splitY(List<LinkedCell> neighbours) {

        List<Double> cuts = new ArrayList<>();
        for (LinkedCell neighbour : neighbours) {
            cuts.add(neighbour.originalCell.getMaxY());
        }
        Collections.sort(cuts);

        Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
        double right = originalCell.getBBox().getMaxX();
        double left = originalCell.getBBox().getX();
        double bottom = originalCell.getBBox().getMaxY();

        List<LinkedCell> pieces = new LinkedList<>();
        for (Double cut : cuts) {
            double y = Math.min(cut, bottom);
            pieces.add(new LinkedCell(copyCell(topLeft, new Point2D.Double(right, y))));
            topLeft = new Point2D.Double(left, y);
        }
        return pieces;
    }


    /**
     * Cuts the cell vertically at the right edge (max x) of every neighbour, in ascending order.
     * Cut positions are capped at the right edge of this cell.
     */
    private List<LinkedCell> splitX(List<LinkedCell> neighbours) {

        List<Double> cuts = new ArrayList<>();
        for (LinkedCell neighbour : neighbours) {
            cuts.add(neighbour.originalCell.getMaxX());
        }
        Collections.sort(cuts);

        Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
        double bottom = originalCell.getBBox().getMaxY();
        double top = originalCell.getBBox().getY();
        double right = originalCell.getBBox().getMaxX();

        List<LinkedCell> pieces = new LinkedList<>();
        for (Double cut : cuts) {
            double x = Math.min(cut, right);
            pieces.add(new LinkedCell(copyCell(topLeft, new Point2D.Double(x, bottom))));
            topLeft = new Point2D.Double(x, top);
        }
        return pieces;
    }


    /**
     * Creates a cell for the given corners that shares the header flag and the text blocks of this cell.
     */
    private Cell copyCell(Point2D topLeft, Point2D bottomRight) {

        Cell copy = Cell.fromPageCoordinates(topLeft, bottomRight, pageToPdfTransform);
        copy.setHeaderCell(originalCell.isHeaderCell());
        copy.setTextBlocks(originalCell.getTextBlocks());
        return copy;
    }


    public void resetNeighbours() {

        rights.clear();
        lefts.clear();
        aboves.clear();
        belows.clear();
    }

}
}

View File

@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import java.util.Collection;
import java.util.Collections;

View File

@ -0,0 +1,113 @@
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
import java.awt.Color;
import java.util.Objects;
import com.knecon.fforesight.service.ocr.v1.api.model.Figure;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.service.ocr.v1.api.model.KeyValuePair;
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
import com.knecon.fforesight.service.ocr.v1.api.model.Region;
import com.knecon.fforesight.service.ocr.v1.api.model.Table;
import com.knecon.fforesight.service.ocr.v1.api.model.TableCell;
import com.knecon.fforesight.service.ocr.v1.api.model.TableCellType;
import com.knecon.fforesight.service.ocr.v1.api.model.TextRegion;
import com.knecon.fforesight.service.viewerdoc.layers.IdpLayerConfig;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle;
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
import lombok.AccessLevel;
import lombok.experimental.FieldDefaults;
/**
 * Layer that visualizes an IDP result: tables, key-value pairs and figures.
 */
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
public class IdpResultLayer extends IdpLayerConfig {

    public static final int LINE_WIDTH = 1;


    /**
     * Adds the visualizations for all tables, then all key-value pairs, then all figures of the result.
     */
    public IdpResultLayer(IdpResult result) {

        result.tables()
                .forEach(this::addTable);
        result.keyValuePairs()
                .forEach(this::addKeyValue);
        result.figures()
                .forEach(this::addFigure);
    }


    /**
     * Outlines the figure's image region and, if present, its caption.
     */
    private void addFigure(Figure figure) {

        addRegion(figure.image(), figures, IMAGE_COLOR);
        var caption = figure.caption();
        if (caption != null) {
            addRegion(caption.region(), figures, IMAGE_COLOR);
        }
    }


    /**
     * Outlines the table's bounding boxes, cells, caption (if present) and footnotes.
     * Row and column header cells are additionally filled.
     */
    private void addTable(Table table) {

        table.bboxes()
                .forEach(bbox -> addRegion(bbox, tables, TABLE_COLOR));

        for (TableCell cell : table.cells()) {
            Region cellRegion = cell.textRegion().region();
            addRegion(cellRegion, tables, INNER_LINES_COLOR);
            boolean isHeader = Objects.equals(cell.kind(), TableCellType.ROW_HEADER) || Objects.equals(cell.kind(), TableCellType.COLUMN_HEADER);
            if (isHeader) {
                addRegionAsFilledRect(cell.textRegion().region(), tables, HEADER_CELL_COLOR);
            }
        }

        var caption = table.caption();
        if (caption != null) {
            addRegion(caption.region(), tables, TABLE_COLOR);
        }

        table.footnotes()
                .forEach(footnote -> addRegion(footnote.region(), tables, FOOTNOTE_COLOR));
    }


    /**
     * Adds the outline of the quad point as colored lines on the given page.
     */
    private void addQuadPoint(int pageNumber, QuadPoint bbox, Visualizations vis, Color color) {

        var pageVisualizations = getOrCreateVisualizationsOnPage(pageNumber, vis);
        bbox.asLines()
                .map(line -> new ColoredLine(line, color, LINE_WIDTH))
                .forEach(coloredLine -> pageVisualizations.getColoredLines().add(coloredLine));
    }


    /**
     * Adds the outline of the region's bounding box as colored lines on the region's page.
     */
    private void addRegion(Region region, Visualizations vis, Color color) {

        var pageVisualizations = getOrCreateVisualizationsOnPage(region.pageNumber(), vis);
        region.bbox().get().asLines()
                .map(line -> new ColoredLine(line, color, LINE_WIDTH))
                .forEach(coloredLine -> pageVisualizations.getColoredLines().add(coloredLine));
    }


    /**
     * Adds the bounds of the region's bounding box as a filled rectangle on the region's page.
     */
    private void addRegionAsFilledRect(Region region, Visualizations vis, Color color) {

        var pageVisualizations = getOrCreateVisualizationsOnPage(region.pageNumber(), vis);
        var filledRectangle = new FilledRectangle(region.bbox().get().getBounds2D(), color, 0.2f);
        pageVisualizations.getFilledRectangles().add(filledRectangle);
    }


    /**
     * Outlines the key and the value of the pair, where present. If both are present, an arrow from the key
     * to the value is added on the page of the key.
     */
    public void addKeyValue(KeyValuePair keyValue) {

        boolean hasKey = keyValue.key() != null;
        boolean hasValue = keyValue.value() != null;

        if (hasKey) {
            addRegion(keyValue.key().region(), keyValuePairs, KEY_COLOR);
        }
        if (hasValue) {
            addRegion(keyValue.value().region(), keyValuePairs, VALUE_COLOR);
        }
        if (!hasKey || !hasValue) {
            return;
        }

        QuadPoint keyBox = keyValue.key().region().bbox().get();
        QuadPoint valueBox = keyValue.value().region().bbox().get();

        var connection = LineUtils.findClosestMidpointLine(keyBox, valueBox);
        // the arrow head is at most 5 long and never longer than the connecting line
        var arrowHead = LineUtils.createArrowHead(connection, Math.min(LineUtils.length(connection), 5));

        var linesOnPage = getOrCreateVisualizationsOnPage(keyValue.key().region().pageNumber(), keyValuePairs).getColoredLines();
        linesOnPage.add(new ColoredLine(connection, KEY_VALUE_BBOX_COLOR, LINE_WIDTH));
        linesOnPage.add(new ColoredLine(arrowHead[0], KEY_VALUE_BBOX_COLOR, LINE_WIDTH));
        linesOnPage.add(new ColoredLine(arrowHead[1], KEY_VALUE_BBOX_COLOR, LINE_WIDTH));
    }

}

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
import java.io.File;
import java.util.LinkedList;
import java.util.List;
import org.springframework.stereotype.Service;
@ -14,6 +15,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.OutlineMapper;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutGrid;
import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup;
import com.knecon.fforesight.service.viewerdoc.model.Outline;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
@ -48,16 +50,15 @@ public class LayoutGridService {
document.layoutDebugLayer().addSentenceVisualization(document.document().getTextBlock());
document.layoutDebugLayer().addOutlineHeadlines(document.document());
List<LayerGroup> layers = new LinkedList<>();
layers.add(layoutGrid);
if (document.layoutDebugLayer().isActive()) {
viewerDocumentService.addLayerGroups(originFile,
destinationFile,
List.of(layoutGrid, document.layoutDebugLayer()),
layoutParserVersion,
layoutParsingTypeName,
outline);
} else {
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid), layoutParserVersion, layoutParsingTypeName, outline);
layers.add(document.layoutDebugLayer());
}
viewerDocumentService.addLayerGroups(originFile, destinationFile, layers, layoutParserVersion, layoutParsingTypeName, outline);
}

View File

@ -0,0 +1,125 @@
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
import java.awt.Color;
import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.util.List;
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
import lombok.experimental.UtilityClass;
@UtilityClass
public class LineUtils {
/**
 * Returns the four edges of the quad point as lines of width 1, green if {@code tight} is set and blue otherwise.
 */
public List<ColoredLine> quadPointAsLines(QuadPoint rect, boolean tight) {

    Color edgeColor = tight ? Color.GREEN : Color.BLUE;
    return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), edgeColor, 1),
                   new ColoredLine(new Line2D.Double(rect.b(), rect.c()), edgeColor, 1),
                   new ColoredLine(new Line2D.Double(rect.c(), rect.d()), edgeColor, 1),
                   new ColoredLine(new Line2D.Double(rect.d(), rect.a()), edgeColor, 1));
}
/**
 * Returns the four edges of the quad point (a-b, b-c, c-d, d-a) as lines of width 1 in the given color.
 */
public List<ColoredLine> quadPointAsLines(QuadPoint rect, Color color) {

    Line2D edgeAb = new Line2D.Double(rect.a(), rect.b());
    Line2D edgeBc = new Line2D.Double(rect.b(), rect.c());
    Line2D edgeCd = new Line2D.Double(rect.c(), rect.d());
    Line2D edgeDa = new Line2D.Double(rect.d(), rect.a());

    return List.of(new ColoredLine(edgeAb, color, 1), new ColoredLine(edgeBc, color, 1), new ColoredLine(edgeCd, color, 1), new ColoredLine(edgeDa, color, 1));
}
/**
 * Applies the transform to both end points of the line and returns the resulting line.
 */
public static Line2D transform(Line2D line2D, AffineTransform affineTransform) {

    Point2D start = affineTransform.transform(line2D.getP1(), null);
    Point2D end = affineTransform.transform(line2D.getP2(), null);
    return new Line2D.Double(start.getX(), start.getY(), end.getX(), end.getY());
}
public static double length(Line2D line2D) {
return line2D.getP1().distance(line2D.getP2());
}
public static Line2D findClosestMidpointLine(QuadPoint quad1, QuadPoint quad2) {
List<Line2D> lines1 = quad1.asLines()
.toList();
List<Line2D> lines2 = quad2.asLines()
.toList();
Line2D closestLine1 = null;
Line2D closestLine2 = null;
double minDistance = Double.MAX_VALUE;
for (Line2D line1 : lines1) {
for (Line2D line2 : lines2) {
double distance = lineDistance(line1, line2);
if (distance < minDistance) {
minDistance = distance;
closestLine1 = line1;
closestLine2 = line2;
}
}
}
if (closestLine1 == null || closestLine2 == null) {
throw new IllegalStateException("Could not find closest lines");
}
Point2D midpoint1 = getMidpoint(closestLine1);
Point2D midpoint2 = getMidpoint(closestLine2);
return new Line2D.Double(midpoint1, midpoint2);
}
private static double lineDistance(Line2D line1, Line2D line2) {
return Math.abs(getMidpoint(line1).distance(getMidpoint(line2)));
}
private static Point2D getMidpoint(Line2D line) {
double x = (line.getX1() + line.getX2()) / 2;
double y = (line.getY1() + line.getY2()) / 2;
return new Point2D.Double(x, y);
}
public static Line2D[] createArrowHead(Line2D line, double arrowLength) {
Point2D start = line.getP1();
Point2D end = line.getP2();
// Calculate the angle of the line
double angle = Math.atan2(end.getY() - start.getY(), end.getX() - start.getX());
// Calculate the points for the two arrow lines
double arrowHeadAngle = Math.PI / 6;
double x1 = end.getX() - arrowLength * Math.cos(angle - arrowHeadAngle);
double y1 = end.getY() - arrowLength * Math.sin(angle - arrowHeadAngle);
double x2 = end.getX() - arrowLength * Math.cos(angle + arrowHeadAngle);
double y2 = end.getY() - arrowLength * Math.sin(angle + arrowHeadAngle);
// Create and return the two arrow lines
Line2D arrow1 = new Line2D.Double(end, new Point2D.Double(x1, y1));
Line2D arrow2 = new Line2D.Double(end, new Point2D.Double(x2, y2));
return new Line2D[]{arrow1, arrow2};
}
}

View File

@ -1,34 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Rectangle2D;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
import lombok.experimental.UtilityClass;
/**
 * Merges several page-to-bounding-box maps into a single map with one bounding box per page.
 */
@UtilityClass
public class BBoxMergingUtility {

    /**
     * For every page that occurs as a key in any of the given maps, collects the rectangles
     * registered for that page across all maps with {@code RectangleTransformations.collectBBox()}.
     *
     * @param bboxesToMerge the per-page bounding boxes to combine
     * @return a new map holding one collected rectangle for each page found in the input
     */
    public Map<Page, Rectangle2D> mergeBBoxes(List<Map<Page, Rectangle2D>> bboxesToMerge) {

        Map<Page, Rectangle2D> merged = new HashMap<>();
        for (Map<Page, Rectangle2D> candidate : bboxesToMerge) {
            for (Page page : candidate.keySet()) {
                // each page is resolved once, no matter how many maps mention it
                if (!merged.containsKey(page)) {
                    merged.put(page, collectOnPage(bboxesToMerge, page));
                }
            }
        }
        return merged;
    }


    /** Collects the rectangles of all maps that contain the given page. */
    private Rectangle2D collectOnPage(List<Map<Page, Rectangle2D>> bboxesToMerge, Page page) {

        return bboxesToMerge.stream()
                .filter(bBoxPerPage -> bBoxPerPage.containsKey(page))
                .map(bBoxPerPage -> bBoxPerPage.get(page))
                .collect(RectangleTransformations.collectBBox());
    }

}

View File

@ -2,6 +2,8 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.AffineTransform;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;

View File

@ -4,12 +4,14 @@ import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
public class GeometricComparators {
private static final int COMPARATOR_ROUNDING = 2;
static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
public static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (point1, point2) -> {
@ -58,6 +60,17 @@ public class GeometricComparators {
return cell1Size.compareTo(cell2Size);
};
/**
 * Orders bounding boxes for cell sorting. Equal boxes compare as 0. If the vertical overlap of
 * the two boxes exceeds VERTICAL_COMPARISON_THRESHOLD times their mean height (presumably meaning
 * they sit in the same row), they are ordered by their minimum x; otherwise by their maximum y.
 */
public static final Comparator<BoundingBox> CELL_SORTER = (first, second) -> {
    if (first.equals(second)) {
        return 0;
    }
    var overlap = first.verticalOverlapPdf(second);
    var meanHeight = (first.getHeight() + second.getHeight()) / 2;
    boolean overlapsEnough = overlap > VERTICAL_COMPARISON_THRESHOLD * meanHeight;
    return overlapsEnough //
            ? Double.compare(first.getMinX(), second.getMinX()) //
            : Double.compare(first.getMaxY(), second.getMaxY());
};
public static final Comparator<Rectangle2D> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
Double rect1Size = rect1.getHeight() * rect1.getWidth();

View File

@ -1,59 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.awt.geom.Rectangle2D;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
/**
 * Basic geometry of a page: its media box, its page number and its rotation in degrees.
 */
public record PageInformation(Rectangle2D mediabox, int number, int rotationDegrees) {

    /**
     * Reads media box and rotation from a PDFBox page.
     *
     * @param pageNum the number to store for the page
     * @param page    the PDFBox page to read from
     */
    public static PageInformation fromPDPage(int pageNum, PDPage page) {

        PDRectangle box = page.getMediaBox();
        Rectangle2D bounds = new Rectangle2D.Double(box.getLowerLeftX(), box.getLowerLeftY(), box.getWidth(), box.getHeight());
        return new PageInformation(bounds, pageNum, page.getRotation());
    }


    /**
     * Builds the information from a document-graph page; the media box starts at the origin and
     * uses the page's width and height.
     */
    public static PageInformation fromPage(Page page) {

        Rectangle2D bounds = new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight());
        return new PageInformation(bounds, page.getNumber(), page.getRotation());
    }


    /** @return the height of the media box */
    public double height() {

        return mediabox.getHeight();
    }


    /** @return the media box width for rotations of 90 or 270 degrees, its height otherwise */
    public double heightRot() {

        return switch (rotationDegrees) {
            case 90, 270 -> width();
            default -> height();
        };
    }


    /** @return the width of the media box */
    public double width() {

        return mediabox.getWidth();
    }


    /** @return the x coordinate of the media box */
    public double minX() {

        return mediabox.getX();
    }


    /** @return the y coordinate of the media box */
    public double minY() {

        return mediabox.getY();
    }

}

View File

@ -1,42 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.zip.GZIPOutputStream;
import com.google.protobuf.Message;
import com.google.protobuf.MessageOrBuilder;
import com.google.protobuf.Struct;
import com.google.protobuf.util.JsonFormat;
import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;
/**
 * Helpers for converting protobuf messages to and from JSON and for writing them to disk.
 */
@UtilityClass
public class ProtobufUtil {

    /**
     * Prints the message with the default {@link JsonFormat} printer.
     *
     * @param messageOrBuilder the message (or builder) to print
     * @return the JSON representation
     * @throws IOException if the message cannot be printed
     */
    public static String toJson(MessageOrBuilder messageOrBuilder) throws IOException {

        return JsonFormat.printer().print(messageOrBuilder);
    }


    /**
     * Parses JSON into a generic {@link Struct} message, ignoring unknown fields.
     *
     * @param json the JSON text to parse
     * @return the parsed message
     * @throws IOException if the JSON cannot be parsed
     */
    public static Message fromJson(String json) throws IOException {

        Message.Builder structBuilder = Struct.newBuilder();
        JsonFormat.parser().ignoringUnknownFields().merge(json, structBuilder);
        return structBuilder.build();
    }


    /**
     * Writes the message, gzip-compressed, into a newly created temporary file.
     * <p>
     * The caller owns the returned file and is responsible for deleting it. If writing or closing
     * the stream fails, the partially written file is removed before the exception is rethrown.
     *
     * @param any the message to serialize
     * @return the temporary file containing the compressed message
     */
    @SneakyThrows
    public <T extends Message> File serializeToTempFile(T any) {

        File tempFile = File.createTempFile("storage-protobuf", ".data");
        try (var out = new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(tempFile)))) {
            any.writeTo(out);
        } catch (IOException | RuntimeException e) {
            // Best-effort cleanup: nobody receives a handle to the file on failure, so it would
            // otherwise stay behind in the temp directory. The result of delete() is deliberately
            // ignored because the original failure is the one worth reporting.
            tempFile.delete();
            throw e;
        }
        return tempFile;
    }

}

View File

@ -2,10 +2,12 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
import static java.lang.String.format;
import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
@ -125,7 +127,7 @@ public class RectangleTransformations {
}
public static Rectangle2D rectangle2DBBox(List<Rectangle2D> rectangle2DList) {
public static Rectangle2D rectangle2DBBox(Collection<Rectangle2D> rectangle2DList) {
return rectangle2DList.stream()
.collect(new Rectangle2DBBoxCollector());
@ -185,6 +187,12 @@ public class RectangleTransformations {
}
/**
 * Applies the transform to the rectangle and returns the bounding box of the transformed shape.
 *
 * @param rect      the rectangle to transform
 * @param transform the transform to apply
 * @return the bounds of the transformed rectangle
 */
public static Rectangle2D transform(Rectangle2D rect, AffineTransform transform) {

    var transformedOutline = transform.createTransformedShape(rect);
    return transformedOutline.getBounds2D();
}
private static class Rectangle2DBBoxCollector implements Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> {
@Override

View File

@ -13,7 +13,7 @@ import java.util.stream.Collectors;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
@ -46,6 +46,12 @@ public class TextPositionOperations {
return sortUsingLineDetection(sequences);
}
/**
 * Puts the words of the given text block into a set, which drops duplicate entries, and returns
 * them in the order produced by {@code sortUsingLineDetection}.
 *
 * @param textBlocks the text block whose words are sorted
 * @return the sorted words
 */
public List<Word> mergeAndSort(TextPageBlock textBlocks) {

    var uniqueWords = new HashSet<>(textBlocks.getWords());
    return sortUsingLineDetection(uniqueWords);
}
public List<Word> sort(List<Word> sequences) {

View File

@ -1,44 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
import java.util.HashMap;
import java.util.Map;
/**
 * Simple disjoint-set (union-find) data structure with path compression and union by size.
 * Nodes are registered lazily the first time they are passed to {@link #find(Object)}.
 *
 * @see <a href="https://en.wikipedia.org/wiki/Disjoint-set_data_structure">Disjoint-set data structure</a>
 */
public class UnionFind<T> {

    Map<T, T> parents = new HashMap<>();
    Map<T, Integer> numberOfObjects = new HashMap<>();


    /**
     * Returns the representative of the set containing the node. Unknown nodes are registered as
     * their own set of size 1. Every node visited on the way up is re-attached directly to the
     * representative.
     *
     * @param node the node to look up
     * @return the representative of the node's set
     */
    public T find(T node) {

        if (!parents.containsKey(node)) {
            parents.put(node, node);
            numberOfObjects.put(node, 1);
        }

        // first pass: climb until a node that is its own parent is reached
        T candidate = node;
        T parentOfCandidate = parents.get(candidate);
        while (!candidate.equals(parentOfCandidate)) {
            candidate = parentOfCandidate;
            parentOfCandidate = parents.get(candidate);
        }
        T representative = parentOfCandidate;

        // second pass: point every node on the walked path straight at the representative
        T current = node;
        T next = parents.get(current);
        while (!current.equals(next)) {
            parents.put(current, representative);
            current = next;
            next = parents.get(current);
        }
        return representative;
    }


    /**
     * Merges the sets of the two nodes. The representative of the smaller set is attached to the
     * representative of the larger one; on equal sizes the set of {@code node1} absorbs the set of
     * {@code node2}.
     *
     * @param node1 a node of the first set
     * @param node2 a node of the second set
     */
    public void union(T node1, T node2) {

        T root1 = find(node1);
        T root2 = find(node2);
        if (root1.equals(root2)) {
            return;
        }

        boolean firstIsSmaller = numberOfObjects.getOrDefault(root1, 1) < numberOfObjects.getOrDefault(root2, 1);
        T absorbed = firstIsSmaller ? root1 : root2;
        T survivor = firstIsSmaller ? root2 : root1;

        parents.put(absorbed, survivor);
        numberOfObjects.put(survivor, numberOfObjects.get(survivor) + numberOfObjects.get(absorbed));
    }

}

View File

@ -17,7 +17,6 @@ import java.util.concurrent.atomic.AtomicInteger;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
import com.iqser.red.service.redaction.v1.server.data.LayoutEngineProto;
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
@ -36,7 +35,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.viewerdoc.layers.LayoutDebugLayerConfig;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
@ -59,7 +58,7 @@ import lombok.experimental.FieldDefaults;
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LayoutDebugLayer extends LayoutDebugLayerConfig {
boolean active;
boolean active = true;
Map<Integer, AtomicInteger> outlineObjectsWithoutPointsPerPage = new HashMap<>();
@ -141,7 +140,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
}
public void addCellVisualizations(List<? extends BoundingBox> cells, int pageNumber) {
public void addCellVisualizations(Collection<? extends BoundingBox> cells, int pageNumber, Color color) {
if (!active) {
return;
@ -149,7 +148,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells);
visualizationsOnPage.getColoredRectangles()
.addAll(cells.stream()
.map(cell -> new ColoredRectangle(cell.getBBoxPdf(), CELLS_COLOR, 1))
.map(cell -> new ColoredRectangle(cell.getBBoxPdf(), color == null ? CELLS_COLOR : color, 1))
.toList());
}
@ -211,7 +210,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
}
public void addTextBlockVisualizations(List<AbstractPageBlock> textPageBlocks, int page) {
public void addTextBlockVisualizations(List<? extends AbstractPageBlock> textPageBlocks, int page) {
if (!active) {
return;

View File

@ -26,9 +26,8 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNo
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.viewerdoc.layers.LayoutGridLayerConfig;
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
@ -93,19 +92,10 @@ public class LayoutGrid extends LayoutGridLayerConfig {
public void addTreeId(SemanticNode semanticNode) {
Page page = semanticNode.getFirstPage();
if (semanticNode.getBBox()
.get(page) == null) {
if (semanticNode.getBBox().get(page) == null) {
return;
}
addPlacedText(page,
semanticNode.getBBox()
.get(page),
semanticNode.getBBox()
.get(page),
buildTreeIdString(semanticNode),
1,
treeIds,
TREEID_COLOR);
addPlacedText(page, semanticNode.getBBox().get(page), semanticNode.getBBox().get(page), buildTreeIdString(semanticNode), 1, treeIds, TREEID_COLOR);
}
@ -134,8 +124,7 @@ public class LayoutGrid extends LayoutGridLayerConfig {
.toList();
Integer maxChildDepth = subSections.stream()
.map(node -> node.getTreeId().size())
.max(Integer::compareTo)
.orElse(section.getTreeId().size());
.max(Integer::compareTo).orElse(section.getTreeId().size());
int ownDepth = section.getTreeId().size();
Page firstPage = section.getFirstPage();
@ -321,8 +310,7 @@ public class LayoutGrid extends LayoutGridLayerConfig {
Visualizations visualizations = semanticNode.getType().equals(NodeType.TABLE_OF_CONTENTS) ? toc : sections;
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), visualizations).getColoredLines();
int lineWidthModifier = maxChildDepth - ownDepth;
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox()
.get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier));
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox().get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier));
SemanticNode highestParent = semanticNode.getHighestParent();
Rectangle2D highestParentRect = rectangleMap.get(new RectangleIdentifier(highestParent.getTreeId(), page.getNumber()));
@ -371,8 +359,7 @@ public class LayoutGrid extends LayoutGridLayerConfig {
List<Double> ys = yStream.collect(Collectors.toList());
ys.remove(0);
Rectangle2D tableBBox = table.getBBox()
.get(page);
Rectangle2D tableBBox = table.getBBox().get(page);
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), tables).getColoredLines();
xs.forEach(x -> {

View File

@ -0,0 +1,60 @@
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
import static org.junit.jupiter.api.Assertions.*;
import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
class TableAreaFillerTest {
@Test
void findMissingCells() {
Rectangle2D area = new Rectangle2D.Double(0, 0, 2, 2);
List<Rectangle2D> rectangles = List.of(new Rectangle2D.Double(0, 0, 1, 1), new Rectangle2D.Double(1, 1, 1, 1), new Rectangle2D.Double(1, 0, 1, 1));
Set<Rectangle2D> missing = TableAreaFiller.findMissingRects(rectangles, area);
assertEquals(1, missing.size());
assertEquals(new Rectangle2D.Double(0, 1, 1, 1), missing.iterator().next());
}
@Test
void findMissingCells2() {
Rectangle2D area = new Rectangle2D.Double(0, 0, 3, 3);
List<Rectangle2D> rectangles = List.of(new Rectangle2D.Double(0, 0, 1, 1),
new Rectangle2D.Double(1, 0, 1, 1),
new Rectangle2D.Double(2, 0, 1, 1),
new Rectangle2D.Double(0, 1, 1, 1),
new Rectangle2D.Double(1, 1, 1, 1),
new Rectangle2D.Double(2, 1, 1, 1));
var missing = TableAreaFiller.findMissingRects(rectangles, area);
assertEquals(1, missing.size());
assertEquals(new Rectangle2D.Double(0, 2, 3, 1), missing.iterator().next());
}
@Test
void findMissingCells3() {
Rectangle2D area = new Rectangle2D.Double(0, 0, 2, 2);
List<Rectangle2D> rectangles = List.of(new Rectangle2D.Double(0, 0, 1, 1));
Set<Rectangle2D> missing = TableAreaFiller.findMissingRects(rectangles, area);
assertEquals(2, missing.size());
Iterator<Rectangle2D> iterator = missing.iterator();
assertEquals(new Rectangle2D.Double(0, 1, 2, 1), iterator.next());
assertEquals(new Rectangle2D.Double(1, 0, 1, 1), iterator.next());
}
}

View File

@ -75,6 +75,7 @@ public abstract class AbstractTest {
protected final static String TENANT_ID = "tenant";
protected final static String VIEWER_DOCUMENT_ID = "viewer";
protected final static String SIMPLIFIED_ID = "simplified";
protected final static String IDP_ID = "idp";
protected LayoutParsingRequest buildStandardLayoutParsingRequest() {
@ -117,7 +118,14 @@ public abstract class AbstractTest {
/**
 * Builds the default request without an IDP result by delegating to the four-argument overload
 * with {@code withIdpResult} set to {@code false}.
 */
public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {

    final boolean withIdpResult = false;
    return buildDefaultLayoutParsingRequest(fileName, layoutParsingType, debug, withIdpResult);
}
public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug, boolean withIdpResult) {
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
Optional<String> idpResultStorageId = withIdpResult ? Optional.of(fileName + IDP_ID) : Optional.empty();
return LayoutParsingRequest.builder()
.identifier(identifier)
.layoutParsingType(layoutParsingType)
@ -132,6 +140,7 @@ public abstract class AbstractTest {
.simplifiedTextStorageId(fileName + SIMPLIFIED_ID)
.viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID)
.documentMarkdownFileStorageId(Optional.of(fileName + MARKDOWN_FILE_ID))
.idpResultStorageId(idpResultStorageId)
.build();
}

View File

@ -34,6 +34,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.factory.Doc
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import lombok.SneakyThrows;
@ -51,11 +52,12 @@ public class BdrJsonBuildTest extends AbstractTest {
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.CLARIFYND,
layoutParsingPipeline.parseLayout(LayoutParsingType.CLARIFYND,
file,
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
Map.of("file",file.toString()))).document();
file,
new ImageServiceResponse(),
new TableServiceResponse(),
IdpResult.empty(),
new VisualLayoutParsingResponse(),
Map.of("file",file.toString()))).document();
}

View File

@ -16,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVi
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import lombok.SneakyThrows;
@ -39,6 +40,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
fileResource,
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
new TableServiceResponse(),
IdpResult.empty(),
new VisualLayoutParsingResponse(),
Map.of("file", filename, "debug", "true"));
}
@ -63,6 +65,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get()),
new TableServiceResponse(),
IdpResult.empty(),
new VisualLayoutParsingResponse(),
layoutParsingRequest.identifier()));
} else {

View File

@ -30,6 +30,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.tenantcommons.TenantsClient;
import lombok.AllArgsConstructor;
@ -106,6 +107,7 @@ public class HeadlinesGoldStandardIntegrationTest {
pdfFileResource.getFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
IdpResult.empty(),
new VisualLayoutParsingResponse(),
Map.of("file", filePath))).document();
var foundHeadlines = documentGraph.streamAllSubNodes()

View File

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.server;
import java.io.File;
import java.io.FileInputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
@ -24,6 +25,7 @@ import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Disabled
public class LayoutparserEnd2EndTest extends AbstractTest {
public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE_OLD;
@ -33,15 +35,24 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
@Test
@Disabled
public void testLayoutParserEndToEnd() {
String filePath = "/home/kschuettler/Dokumente/LayoutparsingEvaluation/VV-340050.pdf";
String filePath = "/home/kschuettler/Downloads/2021-2048323.pdf";
runForFile(filePath);
}
/**
 * Runs the end-to-end flow for a document together with its IDP result.
 * Both inputs are read from fixed paths below /tmp/OCR_TEST, so they must exist on the machine
 * running the test.
 */
@Test
public void testLayoutParserEndToEndWithIdpResult() {

    String idpResultJson = "/tmp/OCR_TEST/2009-1048395_50pages_tables.pdf/idpResult.json";
    String documentPdf = "/tmp/OCR_TEST/2009-1048395_50pages_tables.pdf/document.pdf";

    runForFile(documentPdf, idpResultJson);
}
@Test
@Disabled
@SneakyThrows
@ -62,9 +73,15 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
}
/**
 * Runs the end-to-end flow for a file without an IDP result by passing {@code null} as the
 * IDP result path to the two-argument overload.
 */
@SneakyThrows
private void runForFile(String filePath) {

    String noIdpResultPath = null;
    runForFile(filePath, noIdpResultPath);
}
@SneakyThrows
private void runForFile(String filePath, String idpResultPath) {
String fileName = Path.of(filePath).getFileName().toString();
File file;
if (filePath.startsWith("files")) { // from resources
@ -73,7 +90,13 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
file = new File(filePath);
}
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LAYOUT_PARSING_TYPE, true);
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LAYOUT_PARSING_TYPE, true, true);
if (layoutParsingRequest.idpResultStorageId().isPresent() && idpResultPath != null) {
try (var in = new FileInputStream(idpResultPath)) {
storageService.storeObject(TENANT_ID, layoutParsingRequest.idpResultStorageId().get(), in);
}
}
prepareStorage(layoutParsingRequest, file);

View File

@ -32,6 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
import lombok.SneakyThrows;
@ -192,6 +193,7 @@ public class OutlineDetectionTest extends AbstractTest {
fileResource,
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
new TableServiceResponse(),
IdpResult.empty(),
new VisualLayoutParsingResponse(),
Map.of("file", filename, "debug", "true"));
}
@ -209,6 +211,7 @@ public class OutlineDetectionTest extends AbstractTest {
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get()),
new TableServiceResponse(),
IdpResult.empty(),
new VisualLayoutParsingResponse(),
layoutParsingRequest.identifier()));
} else {

View File

@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@ -61,6 +62,7 @@ public class SimplifiedTextServiceTest extends AbstractTest {
file,
new ImageServiceResponse(),
new TableServiceResponse(),
IdpResult.empty(),
new VisualLayoutParsingResponse(),
Map.of("file", file.toString()))).document();
}

View File

@ -21,6 +21,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.server.BuildDocumentTest;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import lombok.SneakyThrows;
@ -58,11 +59,12 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
Map.of("file",filename.toFile().toString()))).document();
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
IdpResult.empty(),
new VisualLayoutParsingResponse(),
Map.of("file",filename.toFile().toString()))).document();
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
ObjectMapper mapper = ObjectMapperFactory.create();

View File

@ -29,7 +29,7 @@ public class DocumentGraphMappingTest extends BuildDocumentTest {
@SneakyThrows
public void testGraphMapping() {
String filename = "files/syngenta/CustomerFiles/SYNGENTA_EFSA_sanitisation_GFL_v1_moreSections.pdf";
String filename = "files/syngenta/CustomerFiles/Fludioxonil_duplicates.pdf";
Document document = buildGraph(filename);
DocumentData documentData = DocumentDataMapper.toDocumentData(document);

View File

@ -17,8 +17,9 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer;
import com.knecon.fforesight.service.layoutparser.server.BuildDocumentTest;
import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
import lombok.SneakyThrows;
@ -74,6 +75,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
documentFile,
new ImageServiceResponse(),
tableResponse,
IdpResult.empty(),
new VisualLayoutParsingResponse(),
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null);

View File

@ -39,6 +39,8 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.server.AbstractTest;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
import lombok.SneakyThrows;
@ -58,6 +60,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
originDocument,
new ImageServiceResponse(),
tableServiceResponse,
IdpResult.empty(),
new VisualLayoutParsingResponse(),
Map.of("file", "document"));
@ -103,24 +106,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
String textToSearch = "Annex to Regulation 283/2013 Annex to Regulation 284/2013";
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
List<PageContents> textPositionPerPage = PageContentExtractor.getDocumentContents(pdfFileResource.getFile(), 4);
var textPositions = textPositionPerPage.stream()
.flatMap(t -> t.getSortedWords()
.flatMap(t -> t.getWords()
.stream()
.map(Word::toString))
.collect(Collectors.joining(" "));
assertThat(textPositions.contains(textToSearch)).isFalse();
assertThat(textPositions.contains(textToSearch)).isTrue();
ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
assertThat(classificationDocument.getHeaders()
.get(0).getTextBlocks().size()).isEqualTo(3);
assertThat(classificationDocument.getHeaders()
.get(0).getTextBlocks()
.get(0).getWords().size()).isEqualTo(8);
assertThat(classificationDocument.getHeaders()
.get(0).getTextBlocks()
.get(0).toString()).contains(textToSearch);
assertThat(classificationDocument.getHeaders().get(0).getTextBlocks().size()).isEqualTo(3);
assertThat(classificationDocument.getHeaders().get(0).getTextBlocks().get(0).getWords().size()).isEqualTo(8);
assertThat(classificationDocument.getHeaders().get(0).getTextBlocks().get(0).toString()).contains(textToSearch);
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, classificationDocument).document();
@ -216,8 +214,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
.toList().get(0);
assertThat(table.getColCount()).isEqualTo(6);
assertThat(table.getRowCount()).isEqualTo(13);
assertThat(table.getRows()
@ -246,8 +243,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
.toList().get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems()
@ -256,12 +252,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(1);
.toList().get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(2);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(0)
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0)
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
@ -293,8 +287,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
.toList().get(0);
assertThat(firstTable.getColCount()).isEqualTo(9);
assertThat(firstTable.getRowCount()).isEqualTo(5);
TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems()
@ -303,12 +296,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(1);
.toList().get(1);
assertThat(secondTable.getColCount()).isEqualTo(9);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(firstTable.getRowCount() - 1)
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1)
.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList());
@ -340,8 +331,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
.toList().get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems()
@ -350,12 +340,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(1);
.toList().get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(0)
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0)
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
@ -376,10 +364,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 4);
validateTable(document, 0, 1, 1, 0, 0);
validateTable(document, 1, 2, 2, 0, 0);
validateTable(document, 2, 4, 19, 12, 0);
validateTable(document, 3, 2, 12, 0, 0);
validateTable(document, 0, 1, 1, 0);
validateTable(document, 1, 2, 2, 0);
validateTable(document, 2, 2, 12, 0);
validateTable(document, 3, 4, 19, 12);
}
@ -393,10 +381,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 4);
validateTable(document, 0, 5, 4, 0, 0);
validateTable(document, 1, 5, 15, 14, 0);
validateTable(document, 2, 5, 14, 11, 0);
validateTable(document, 3, 5, 3, 0, 0);
validateTable(document, 0, 5, 4, 0);
validateTable(document, 1, 5, 15, 14);
validateTable(document, 2, 5, 14, 11);
validateTable(document, 3, 5, 3, 0);
}
@ -410,7 +398,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 1);
validateTable(document, 0, 8, 8, 0, 0);
validateTable(document, 0, 8, 8, 0);
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
"Author, date",
@ -455,10 +443,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 4);
validateTable(document, 0, 3, 2, 0, 0);
validateTable(document, 1, 3, 2, 0, 0);
validateTable(document, 2, 3, 3, 0, 0);
validateTable(document, 3, 3, 3, 0, 0);
validateTable(document, 0, 3, 2, 0);
validateTable(document, 1, 3, 2, 0);
validateTable(document, 2, 3, 3, 0);
validateTable(document, 3, 3, 3, 0);
}
@ -473,7 +461,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 1);
validateTable(document, 0, 7, 4, 0, 0);
validateTable(document, 0, 7, 4, 0);
}
@ -486,7 +474,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 1);
validateTable(document, 0, 7, 4, 0, 0);
validateTable(document, 0, 7, 4, 0);
}
@ -499,12 +487,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 6);
validateTable(document, 0, 2, 1, 0, 0);
validateTable(document, 1, 2, 1, 0, 0);
validateTable(document, 2, 2, 5, 0, 0);
validateTable(document, 3, 2, 5, 0, 0);
validateTable(document, 4, 2, 4, 0, 0);
validateTable(document, 5, 2, 1, 0, 0);
validateTable(document, 0, 2, 1, 0);
validateTable(document, 1, 2, 1, 0);
validateTable(document, 2, 2, 5, 0);
validateTable(document, 3, 2, 5, 0);
validateTable(document, 4, 2, 4, 0);
validateTable(document, 5, 2, 1, 0);
}
@ -518,9 +506,9 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 3);
validateTable(document, 0, 7, 9, 0, 0);
validateTable(document, 1, 2, 1, 0, 0);
validateTable(document, 2, 2, 10, 0, 0);
validateTable(document, 0, 7, 9, 0);
validateTable(document, 1, 2, 1, 0);
validateTable(document, 2, 2, 10, 0);
}
@ -533,7 +521,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
validateTableSize(document, 1);
validateTable(document, 0, 9, 9, 0, 0);
validateTable(document, 0, 9, 9, 0);
}
@ -547,7 +535,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 1);
validateTable(document, 0, 9, 5, 6, 0);
validateTable(document, 0, 9, 5, 6);
}
@ -560,7 +548,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
validateTableSize(document, 1);
validateTable(document, 0, 9, 6, 7, 0);
validateTable(document, 0, 9, 6, 7);
}
@ -574,7 +562,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
validateTableSize(document, 1);
validateTable(document, 0, 10, 6, 0, 0);
validateTable(document, 0, 10, 6, 0);
}
@ -588,8 +576,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
validateTableSize(document, 2);
validateTable(document, 0, 2, 2, 0, 0);
validateTable(document, 1, 1, 1, 0, 0);
validateTable(document, 0, 2, 2, 0);
validateTable(document, 1, 1, 1, 0);
}
@ -604,8 +592,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 2);
validateTable(document, 0, 7, 8, 1, 0);
validateTable(document, 1, 7, 8, 1, 0);
validateTable(document, 0, 7, 8, 1);
validateTable(document, 1, 7, 8, 1);
}
@ -620,8 +608,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 2);
validateTable(document, 0, 4, 17, 0, 0);
validateTable(document, 1, 7, 12, 0, 0);
validateTable(document, 0, 4, 17, 0);
validateTable(document, 1, 7, 12, 0);
}
@ -636,8 +624,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 2);
validateTable(document, 0, 5, 14, 4, 0);
validateTable(document, 1, 7, 12, 0, 0);
validateTable(document, 0, 5, 14, 4);
validateTable(document, 1, 7, 12, 0);
}
@ -651,8 +639,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 2);
validateTable(document, 0, 5, 17, 3, 0);
validateTable(document, 1, 5, 16, 2, 0);
validateTable(document, 0, 5, 17, 3);
validateTable(document, 1, 5, 16, 2);
}
@ -666,10 +654,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 4);
validateTable(document, 0, 4, 4, 0, 0);
validateTable(document, 1, 1, 1, 0, 0);
validateTable(document, 2, 2, 3, 0, 0);
validateTable(document, 3, 1, 1, 0, 0);
validateTable(document, 0, 4, 4, 0);
validateTable(document, 1, 1, 1, 0);
validateTable(document, 2, 2, 3, 0);
validateTable(document, 3, 1, 1, 0);
}
@ -684,7 +672,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 1);
validateTable(document, 0, 11, 8, 0, 0);
validateTable(document, 0, 11, 8, 0);
}
@ -699,8 +687,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 2);
validateTable(document, 0, 6, 8, 0, 0);
validateTable(document, 1, 6, 8, 0, 0);
validateTable(document, 0, 6, 8, 0);
validateTable(document, 1, 6, 8, 0);
}
@ -714,7 +702,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 1);
validateTable(document, 0, 9, 5, 2, 0);
validateTable(document, 0, 9, 5, 2);
}
@ -728,7 +716,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 1);
validateTable(document, 0, 3, 5, 0, 0);
validateTable(document, 0, 3, 5, 0);
}
@ -742,7 +730,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 1);
validateTable(document, 0, 6, 8, 0, 0);
validateTable(document, 0, 6, 8, 0);
}
@ -755,10 +743,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 4);
validateTable(document, 0, 3, 3, 0, 0);
validateTable(document, 1, 3, 6, 2, 0);
validateTable(document, 2, 3, 3, 1, 0);
validateTable(document, 3, 3, 3, 0, 0);
validateTable(document, 0, 3, 6, 0);
validateTable(document, 1, 3, 3, 0);
validateTable(document, 2, 3, 3, 0);
validateTable(document, 3, 3, 3, 0);
}
@ -772,12 +760,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 6);
validateTable(document, 0, 5, 5, 0, 0);
validateTable(document, 1, 5, 6, 0, 0);
validateTable(document, 2, 5, 5, 0, 0);
validateTable(document, 3, 5, 5, 0, 0);
validateTable(document, 4, 5, 5, 0, 0);
validateTable(document, 5, 5, 5, 0, 0);
validateTable(document, 0, 5, 6, 0);
validateTable(document, 1, 5, 5, 0);
validateTable(document, 2, 5, 5, 0);
validateTable(document, 3, 5, 5, 0);
validateTable(document, 4, 5, 5, 0);
validateTable(document, 5, 5, 5, 0);
}
@ -791,7 +779,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 1);
validateTable(document, 0, 6, 5, 0, 0);
validateTable(document, 0, 6, 5, 0);
}
@ -805,7 +793,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 1);
validateTable(document, 0, 5, 8, 1, 0);
validateTable(document, 0, 5, 8, 0);
}
@ -816,13 +804,14 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/T5_Page16_VV-640252.pdf");
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
validateTableSize(document, 5);
validateTable(document, 0, 1, 1, 0, 0);
validateTable(document, 1, 1, 1, 0, 0);
validateTable(document, 2, 1, 1, 0, 0);
validateTable(document, 3, 1, 1, 0, 0);
validateTable(document, 4, 1, 1, 0, 0);
validateTableSize(document, 6);
// does not make sense to assert anything here other than that it runs. This is not a Table and completely breaks the current table detection logic
// viewerDocumentService.addLayerGroups(pdfFileResource.getFile(), new File("/tmp/cellDebug.pdf"), List.of(document.getLayoutDebugLayer()));
// validateTable(document, 0, 1, 1, 0);
// validateTable(document, 1, 1, 1, 0);
// validateTable(document, 2, 1, 1, 3);
// validateTable(document, 3, 1, 1, 0);
// validateTable(document, 4, 1, 1, 0);
}
@ -836,7 +825,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
validateTableSize(document, 1);
validateTable(document, 0, 6, 6, 5, 0);
validateTable(document, 0, 6, 6, 5);
}
@ -869,7 +858,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect) {
TablePageBlock table = document.getSectionTree().getAllTableOfContentItems()
.stream()
@ -877,8 +866,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(tableIndex);
.toList().get(tableIndex);
List<List<Cell>> rows = table.getRows();
int emptyCellsFoundFound = rows.stream()
@ -891,7 +879,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
for (List<Cell> row : table.getRows()) {
row.forEach(r -> System.out.println(r.toString()));
}
assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect);
assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect);
assertThat(table.getColCount()).isEqualTo(colCount);
assertThat(table.getRowCount()).isEqualTo(rowCount);
@ -907,8 +895,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(tableIndex);
.toList().get(tableIndex);
List<List<Cell>> rows = table.getRows();
List<Cell> rowsFlattened = rows.stream()

View File

@ -6,14 +6,10 @@ import java.util.List;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.services.DividingColumnDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService;
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
import com.knecon.fforesight.service.layoutparser.processor.experimental.DividingColumnDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
@ -21,32 +17,6 @@ import lombok.SneakyThrows;
class GapAcrossLinesDetectionServiceTest {
/**
 * Manual visualisation run for gap-based column detection (kept {@code @Disabled}).
 * <p>
 * Builds a {@link PageInformation} per page of the fixture PDF, derives the x-gaps across lines for each page
 * and passes the resulting rectangles, grouped per page, to {@code PdfDraw.drawRectanglesPerPage}.
 * The duration of each phase is printed to stdout. The method contains no assertions.
 */
@Test
@Disabled
@SneakyThrows
public void testGapBasedColumnDetection() {

    String filename = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
    // NOTE(review): index 2 of the '/'-split is the third path segment ("CustomerFiles" for this path),
    // not the PDF's file name - confirm that this output name is intended.
    String outputFile = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";

    // Phase 1: text extraction and page information building.
    System.out.println("start TextPosition extraction");
    long phaseStart = System.currentTimeMillis();
    List<PageInformation> pages = PageContentExtractor.getSortedPageContents(filename)
            .stream()
            .map(PageInformationService::build)
            .toList();
    List<List<Rectangle2D>> detectedColumns = new LinkedList<>();
    System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - phaseStart);

    // Phase 2: one list of column rectangles per page, in page order.
    System.out.println("start column detection");
    phaseStart = System.currentTimeMillis();
    for (PageInformation page : pages) {
        var sortedWords = page.getPageContents().getSortedWords();
        GapInformation gaps = GapDetectionService.findGapsInLines(sortedWords, page.getMainBodyTextFrame());
        detectedColumns.add(GapsAcrossLinesService.detectXGapsAcrossLines(gaps, page.getMainBodyTextFrame()));
    }
    System.out.printf("Finished column detection in %d ms%n", System.currentTimeMillis() - phaseStart);

    // Phase 3: hand the rectangles to the drawing utility together with the output path.
    System.out.println("start draw rectangles");
    phaseStart = System.currentTimeMillis();
    PdfDraw.drawRectanglesPerPage(filename, detectedColumns, outputFile);
    System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - phaseStart);
}
@Test
@Disabled
@SneakyThrows
@ -56,7 +26,7 @@ class GapAcrossLinesDetectionServiceTest {
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis();
List<PageContents> sortedTextPositionSequencesPerPage = PageContentExtractor.getSortedPageContents(filename);
List<PageContents> sortedTextPositionSequencesPerPage = PageContentExtractor.getDocumentContents(new ClassPathResource(filename).getFile(), 4);
List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
System.out.println("start column detection");

View File

@ -1,66 +0,0 @@
package com.knecon.fforesight.service.layoutparser.server.services;
import java.awt.geom.Rectangle2D;
import java.nio.file.Path;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows;
/**
 * Visualisation test for {@link InvisibleTableDetectionService}.
 * <p>
 * The test makes no assertions: it runs the table detection on a fixed slice of the first page's words and
 * passes the detected rectangles to {@code PdfDraw.drawRectanglesPerPage}, using a path under {@code /tmp}.
 */
class InvisibleTableDetectionServiceTest {

    /**
     * Runs {@code InvisibleTableDetectionService.detectTable} on the words inside a hand-picked bounding box
     * of the first page and hands the result to {@code PdfDraw} for drawing.
     */
    @Test
    // @Disabled
    @SneakyThrows
    public void detectInvisibleTableTest() {

        String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
        var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();

        List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName)
                .stream()
                .map(PageInformationService::build)
                .collect(Collectors.toList());

        // Only the first page is inspected; fetch its word list once and reuse it for both steps below.
        var firstPageWords = pageContents.get(0).getPageContents().getSortedWords();

        // Bounding box spanned by the words with index 45 (inclusive) to 152 (exclusive).
        // NOTE(review): presumably these indices delimit the table in this particular fixture - verify.
        Rectangle2D tableBBox = firstPageWords.subList(45, 152)
                .stream()
                .map(Word::getBBox)
                .map(this::mirrorY)
                .collect(RectangleTransformations.collectBBox());

        // Every word of the page whose (height-normalised) box lies completely inside the table box.
        List<Word> words = firstPageWords.stream()
                .filter(word -> tableBBox.contains(mirrorY(word.getBBox())))
                .toList();

        var table = InvisibleTableDetectionService.detectTable(words, tableBBox);

        // First entry: all detected rectangles flattened into one list; second entry: empty.
        // NOTE(review): presumably one entry per page of the two-page fixture - confirm against PdfDraw.
        PdfDraw.drawRectanglesPerPage(fileName,
                                      List.of(table.stream()
                                                      .flatMap(Collection::stream)
                                                      .toList(), Collections.emptyList()),
                                      tmpFileName);
    }


    /**
     * Normalises a rectangle with negative height.
     *
     * @param rectangle2D the rectangle to normalise
     * @return {@code rectangle2D} itself when its height is non-negative; otherwise a new rectangle with the
     *         same x and width, y shifted by the (negative) height, and the height negated
     */
    private Rectangle2D mirrorY(Rectangle2D rectangle2D) {

        if (rectangle2D.getHeight() >= 0) {
            return rectangle2D;
        }
        return new Rectangle2D.Double(rectangle2D.getX(), rectangle2D.getY() + rectangle2D.getHeight(), rectangle2D.getWidth(), -rectangle2D.getHeight());
    }

}

View File

@ -5,6 +5,7 @@ import java.util.List;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
@ -20,7 +21,7 @@ class MainBodyTextFrameExtractionServiceTest {
String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString();
List<PageContents> sortedTextPositionSequence = PageContentExtractor.getSortedPageContents(fileName);
List<PageContents> sortedTextPositionSequence = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4);
}

View File

@ -4,6 +4,7 @@ import java.nio.file.Path;
import java.util.List;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
@ -21,11 +22,11 @@ class PageContentExtractorTest {
String fileName = "files/syngenta/CustomerFiles/Documine/Flora/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
List<PageContents> textPositionPerPage = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4);
PdfDraw.drawRectanglesPerPageNumberedByLine(fileName,
textPositionPerPage.stream()
.map(t -> t.getSortedWords()
.map(t -> t.getWords()
.stream()
.map(Word::getBBoxPdf)
.map(List::of)

View File

@ -1,63 +0,0 @@
package com.knecon.fforesight.service.layoutparser.server.services;
import java.util.Collection;
import java.util.List;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import lombok.SneakyThrows;
/**
 * Disabled visualisation runs around {@link PageInformationService}: each test builds the page information
 * for a fixture PDF, prints phase timings to stdout and passes the collected rectangles to {@code PdfDraw}.
 * Neither test contains assertions.
 */
class PageInformationServiceTest {

    /**
     * Passes the y-gaps and x-gaps of every page, flattened per page, to
     * {@code PdfDraw.drawRectanglesAndLinesPerPage}.
     */
    @Test
    @Disabled
    @SneakyThrows
    public void testGapDetection() {

        String filename = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
        // NOTE(review): split("/")[2] is the third path segment ("CustomerFiles" here), not the file name.
        String outputFile = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";

        List<PageInformation> pages = extractPageInformations(filename);
        printGapDetectionTiming();

        System.out.println("start draw rectangles");
        long drawStart = System.currentTimeMillis();
        var yGapsPerPage = pages.stream()
                .map(page -> page.getGapInformation().getYGaps().stream().flatMap(Collection::stream).toList())
                .toList();
        var xGapsPerPage = pages.stream()
                .map(page -> page.getGapInformation().getXGaps().stream().flatMap(Collection::stream).toList())
                .toList();
        PdfDraw.drawRectanglesAndLinesPerPage(filename, yGapsPerPage, xGapsPerPage, outputFile);
        System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - drawStart);
    }


    /**
     * Passes the per-line bounding boxes (with gaps) of every page to
     * {@code PdfDraw.drawRectanglesPerPageNumberedByLine}.
     */
    @Test
    @Disabled
    @SneakyThrows
    public void testLineDetection() {

        String filename = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
        // NOTE(review): same "_GAPS.pdf" target as testGapDetection, so both runs write to one path -
        // confirm whether a distinct suffix was intended.
        String outputFile = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";

        List<PageInformation> pages = extractPageInformations(filename);
        printGapDetectionTiming();

        System.out.println("start draw rectangles");
        long drawStart = System.currentTimeMillis();
        var lineBoxesPerPage = pages.stream()
                .map(page -> page.getLineInformation().getBBoxWithGapsByLines().stream().toList())
                .toList();
        PdfDraw.drawRectanglesPerPageNumberedByLine(filename, lineBoxesPerPage, outputFile);
        System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - drawStart);
    }


    /**
     * Extracts the page contents of {@code filename}, maps each page through
     * {@code PageInformationService::build} and prints how long that took.
     */
    @SneakyThrows
    private static List<PageInformation> extractPageInformations(String filename) {

        System.out.println("start TextPosition extraction");
        long extractionStart = System.currentTimeMillis();
        List<PageInformation> pageInformations = PageContentExtractor.getSortedPageContents(filename)
                .stream()
                .map(PageInformationService::build)
                .toList();
        System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - extractionStart);
        return pageInformations;
    }


    /**
     * Prints the "gap detection" start/finish messages. No work is performed between the two timestamps,
     * so the reported duration only covers the timestamp calls themselves.
     */
    private static void printGapDetectionTiming() {

        System.out.println("start gap detection");
        long gapStart = System.currentTimeMillis();
        System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - gapStart);
    }

}

View File

@ -34,9 +34,10 @@ import com.knecon.fforesight.service.layoutparser.processor.services.PageContent
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
import com.knecon.fforesight.service.layoutparser.processor.services.tables.RectangularIntersectionFinder;
import com.knecon.fforesight.service.layoutparser.server.BuildDocumentTest;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import lombok.SneakyThrows;
@ -49,7 +50,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
String fileName = "files/syngenta/CustomerFiles/SinglePages/T5_Page16_VV-640252.pdf";
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_CELLS.pdf";
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
List<PageContents> pageContents = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4);
RulingCleaningService rulingCleaningService = new RulingCleaningService();
List<List<Rectangle2D>> rectanglesPerPage = new LinkedList<>();
for (PageContents pageContent : pageContents) {
@ -69,7 +70,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
String fileName = "files/syngenta/CustomerFiles/SinglePages/Page35_19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017.pdf";
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
List<PageContents> pageContents = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4);
RulingCleaningService rulingCleaningService = new RulingCleaningService();
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
for (PageContents pageContent : pageContents) {
@ -110,6 +111,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
IdpResult.empty(),
new VisualLayoutParsingResponse(),
Map.of("file", filename.toFile().toString()))).document();
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
@ -117,6 +119,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
IdpResult.empty(),
new VisualLayoutParsingResponse(),
Map.of("file", filename.toFile().toString()))).document();
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);

View File

@ -7,6 +7,7 @@ import java.util.Collections;
import java.util.List;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
@ -15,7 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
import com.knecon.fforesight.service.layoutparser.processor.services.tables.RectangularIntersectionFinder;
import lombok.SneakyThrows;
@ -26,19 +27,19 @@ public class RulingsClassifierTest {
public void textRulingExtractionTest() {
String fileName = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
List<PageContents> pageContents = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4);
RulingCleaningService rulingCleaningService = new RulingCleaningService();
for (PageContents pageContent : pageContents) {
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedWords(), cleanRulings);
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getWords(), cleanRulings);
assertTrue(pageContent.getSortedWords()
assertTrue(pageContent.getWords()
.stream()
.filter(word -> word.toString().equals("Underlined"))
.allMatch(Word::isUnderline));
assertTrue(pageContent.getSortedWords()
assertTrue(pageContent.getWords()
.stream()
.filter(word -> word.toString().equals("Striketrough"))
.allMatch(Word::isStrikethrough));
@ -64,13 +65,13 @@ public class RulingsClassifierTest {
public void tableRulingExtractionTest() {
String fileName = "files/SinglePages/AbsolutelyEnormousTable.pdf";
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
List<PageContents> pageContents = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4);
RulingCleaningService rulingCleaningService = new RulingCleaningService();
for (PageContents pageContent : pageContents) {
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedWords(), cleanRulings);
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getWords(), cleanRulings);
assertEquals(30, cleanRulings.getHorizontals().size());
assertEquals(30, cleanRulings.getTableLines().getHorizontals().size());

Some files were not shown because too many files have changed in this diff Show More