Compare commits

7 Commits

main...RED-8670-n

| Author | SHA1 | Date |
|---|---|---|
|  | f68a9a2335 |  |
|  | 39e20bad8d |  |
|  | 06618c2e9e |  |
|  | 5cee042c74 |  |
|  | 0a4cd759a7 |  |
|  | 59f7c7c6a3 |  |
|  | d6b10a5e7e |  |

@@ -10,38 +10,23 @@ import lombok.NonNull;
@Builder
@Schema(description = "Object containing all storage paths the service needs to know.")
public record LayoutParsingRequest(
@Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}")//
@NonNull LayoutParsingType layoutParsingType,
@Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}") @NonNull LayoutParsingType layoutParsingType,
@Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.") Map<String, String> identifier,
@Schema(description = "Path to the original PDF file.") @NonNull String originFileStorageId,

@Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.")//
Map<String, String> identifier,
@Schema(description = "Optional Path to the table extraction file.") Optional<String> tablesFileStorageId,
@Schema(description = "Optional Path to the image classification file.") Optional<String> imagesFileStorageId,
@Schema(description = "Path where the IDP Result File is stored.") Optional<String> idpResultStorageId,
@Schema(description = "Optional Path to the the visual layout parsing service file") Optional<String> visualLayoutParsingFileId,

@Schema(description = "Path to the original PDF file.")//
@NonNull String originFileStorageId,//

@Schema(description = "Optional Path to the table extraction file.")//
Optional<String> tablesFileStorageId,//
@Schema(description = "Optional Path to the image classification file.")//
Optional<String> imagesFileStorageId,//

@Schema(description = "Optional Path to the the visual layout parsing service file") Optional<String> visualLayoutParsingFileId,//

@Schema(description = "Path where the Document Structure File will be stored.")//
@NonNull String structureFileStorageId,//
@Schema(description = "Path where the Research Data File will be stored.")//
String researchDocumentStorageId,//
@Schema(description = "Path where the Document Text File will be stored.")//
@NonNull String textBlockFileStorageId,//
@Schema(description = "Path where the Document Positions File will be stored.")//
@NonNull String positionBlockFileStorageId,//
@Schema(description = "Path where the Document Pages File will be stored.")//
@NonNull String pageFileStorageId,//
@Schema(description = "Path where the Document Markdown File will be stored.")//
Optional<String> documentMarkdownFileStorageId,//
@Schema(description = "Path where the Simplified Text File will be stored.")//
@NonNull String simplifiedTextStorageId,//
@Schema(description = "Path where the Viewer Document PDF will be stored.")//
@NonNull String viewerDocumentStorageId
@Schema(description = "Path where the Document Structure File will be stored.") @NonNull String structureFileStorageId,
@Schema(description = "Path where the Research Data File will be stored.") String researchDocumentStorageId,
@Schema(description = "Path where the Document Text File will be stored.") @NonNull String textBlockFileStorageId,
@Schema(description = "Path where the Document Positions File will be stored.") @NonNull String positionBlockFileStorageId,
@Schema(description = "Path where the Document Pages File will be stored.") @NonNull String pageFileStorageId,
@Schema(description = "Path where the Document Markdown File will be stored.") Optional<String> documentMarkdownFileStorageId,
@Schema(description = "Path where the Simplified Text File will be stored.") @NonNull String simplifiedTextStorageId,
@Schema(description = "Path where the Viewer Document PDF will be stored.") @NonNull String viewerDocumentStorageId
) {

}

@@ -23,6 +23,8 @@ dependencies {
}
implementation("com.iqser.red.commons:storage-commons:2.50.0")

api("com.knecon.fforesight:azure-ocr-service-api:0.25.0")

implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")

@@ -17,4 +17,6 @@ public class LayoutParserSettings {

boolean debug;
LayoutParsingType layoutParsingTypeOverride;
String pdftronLicense;
int extractionThreads = 1;
}

@@ -14,39 +14,39 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Function;
import java.util.stream.Collectors;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;

import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;

import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEnhancementService;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;

@@ -56,24 +56,26 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.tables.TableExtractionService;
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.service.ocr.v1.api.model.Table;

import io.micrometer.observation.Observation;
import io.micrometer.observation.ObservationRegistry;

@@ -98,10 +100,8 @@ public class LayoutParsingPipeline {
final SimplifiedSectionTextService simplifiedSectionTextService;
final RulingCleaningService rulingCleaningService;
final TableExtractionService tableExtractionService;
final DocuMineBlockificationService docuMineBlockificationService;
final RedactManagerBlockificationService redactManagerBlockificationService;
final BlockificationService blockificationService;
final BlockificationPostprocessingService blockificationPostprocessingService;
final DocstrumBlockificationService docstrumBlockificationService;
final LayoutGridService layoutGridService;
final ObservationRegistry observationRegistry;
final VisualLayoutParsingAdapter visualLayoutParsingAdapter;

@@ -111,11 +111,11 @@ public class LayoutParsingPipeline {
final SectionTreeEnhancementService sectionTreeEnhancementService;
final LayoutParserSettings settings;
final ClassificationService classificationService;
final ReadingOrderService readingOrderService;

@Value("${LAYOUT_PARSER_VERSION:}")
private String layoutParserVersion;

public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {

long start = System.currentTimeMillis();

@@ -134,14 +134,16 @@ public class LayoutParsingPipeline {
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
.map(layoutParsingStorageService::getTablesFile)
.orElse(new TableServiceResponse());
IdpResult idpResult = layoutParsingRequest.idpResultStorageId()
.map(layoutParsingStorageService::getIdpResultFile).orElse(IdpResult.empty());

LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null //
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();
LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();

ClassificationDocument classificationDocument = parseLayout(layoutParsingType,
originFile,
imageServiceResponse,
tableServiceResponse,
idpResult,
visualLayoutParsingResponse,
layoutParsingRequest.identifier());

@@ -159,7 +161,8 @@
if (layoutParsingRequest.documentMarkdownFileStorageId()
.isPresent()) {
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
.get(), new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
.get(),
new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
}
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document()));
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);

@@ -237,15 +240,22 @@
File originFile,
ImageServiceResponse imageServiceResponse,
TableServiceResponse tableServiceResponse,
IdpResult idpResult,
VisualLayoutParsingResponse visualLayoutParsingResponse,
Map<String, String> identifier) {

PDDocument originDocument = openDocument(originFile);
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
PageContentExtractor extractor = new PageContentExtractor(originFile, settings.getExtractionThreads());
extractor.startAsync();
int pageCount = extractor.getPageCount();
addNumberOfPagesToTrace(pageCount, Files.size(originFile.toPath()));

Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse, idpResult);
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
Function<Table, Integer> pageNumberExtractor = table -> table.bboxes().get(0).pageNumber();
Map<Integer, List<Table>> idpTablesPerPage = idpResult.tables()
.stream()
.collect(Collectors.groupingBy(pageNumberExtractor));

ClassificationDocument classificationDocument = new ClassificationDocument();

@@ -255,32 +265,20 @@

List<ClassificationPage> classificationPages = new ArrayList<>();

classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));

long pageCount = originDocument.getNumberOfPages();
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originFile));

for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {

if (pageNumber % 100 == 0) {
// re-open document every once in a while to save on RAM. This has no significant performance impact.
// This is due to PDFBox caching all images and some other stuff with Soft References. This dereferences them and forces the freeing of memory.
originDocument.close();
originDocument = openDocument(originFile);
}

PageContents pageContents = extractor.awaitPageContents(pageNumber);
if (pageNumber % 100 == 0 || pageNumber == pageCount || pageNumber == 1) {
log.info("Extracting text on Page {} for {}", pageNumber, identifier);
log.info("Processing text on Page {} for {}", pageNumber, identifier);
}

classificationDocument.setPages(classificationPages);
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
PDPage pdPage = originDocument.getPage(pageNumber - 1);
stripper.setPageNumber(pageNumber);
stripper.setStartPage(pageNumber);
stripper.setEndPage(pageNumber);
stripper.setPdpage(pdPage);
stripper.getText(originDocument);
List<Word> words = stripper.getWords();

List<Word> words = pageContents.getWords();
List<Ruling> rulings = pageContents.getRulings();
PageInformation pageInformation = pageContents.getPageInformation();

// rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now

@@ -291,39 +289,23 @@
}
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);

PDRectangle pdr = pdPage.getMediaBox();

List<Ruling> rulings = stripper.getRulings();
classificationDocument.getLayoutDebugLayer().addRulingVisualization(rulings, pageNumber);
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), rulings);

PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber);

classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber, null);
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);

List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getWords(), false);
List<TablePageBlock> tables = tableExtractionService.extractTables(emptyTableCells, words, pageInformation, idpTablesPerPage.get(pageNumber), layoutParsingType);

pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
.addAll(graphics.stream()
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()),
ImageType.GRAPHIC,
false,
stripper.getPageNumber(),
""))
.toList());
List<ClassifiedImage> graphics = graphicExtractorService.extractPathElementGraphics(pageContents.getGraphicBBoxes(), pageNumber, cleanRulings);
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()).addAll(graphics);

ClassificationPage classificationPage = switch (layoutParsingType) {
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer());
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
};
List<TextPageBlock> textBlocks = blockificationService.blockify(layoutParsingType, words, cleanRulings, classificationDocument.getLayoutDebugLayer());

updateClassificationPage(pdPage, pdr, classificationPage, cleanRulings, pageNumber, pageInformation);
List<AbstractPageBlock> blocks = readingOrderService.resolve(textBlocks, tables);

ClassificationPage classificationPage = new ClassificationPage(blocks, pageInformation, cleanRulings);

blockificationPostprocessingService.findHeadlinesFromOutline(classificationDocument, pageNumber, classificationPage, pageInformation);

@@ -345,16 +327,12 @@
}
}

tableExtractionService.extractTables(emptyTableCells, classificationPage);

buildPageStatistics(classificationPage);
increaseDocumentStatistics(classificationPage, classificationDocument);

classificationPages.add(classificationPage);
}

originDocument.close();

classificationService.classify(classificationDocument, layoutParsingType, identifier);

SectionTree sectionTree = sectionTreeBuilderService.createSectionTree(classificationDocument);

@@ -371,24 +349,6 @@
}

private static void updateClassificationPage(PDPage pdPage,
PDRectangle pdr,
ClassificationPage classificationPage,
CleanRulings cleanRulings,
int pageNumber,
PageInformation pageInformation) {

int rotation = pdPage.getRotation();
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
classificationPage.setCleanRulings(cleanRulings);
classificationPage.setRotation(rotation);
classificationPage.setLandscape(isLandscape);
classificationPage.setPageNumber(pageNumber);
classificationPage.setPageWidth((float) pageInformation.width());
classificationPage.setPageHeight((float) pageInformation.height());
}

private static void rotateDirAdjExactly(List<Word> words, PDPage pdPage) {

for (TextDirection dir : TextDirection.values()) {

@@ -25,6 +25,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocVersioningUtility;
import com.knecon.fforesight.tenantcommons.TenantContext;

@@ -95,7 +96,23 @@ public class LayoutParsingStorageService {
}

@SneakyThrows

@SneakyThrows
public IdpResult getIdpResultFile(String storageId) {

if (!storageService.objectExists(TenantContext.getTenantId(), storageId)) {
return IdpResult.empty();
}
try (var idpResultStream = getObject(storageId)) {

IdpResult idpResult = objectMapper.readValue(idpResultStream, IdpResult.class);
idpResultStream.close();
return idpResult;
}
}

@SneakyThrows
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) {

try (InputStream inputStream = getObject(storageId)) {

@@ -1,9 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum;

import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
import java.util.stream.Collectors;

import org.springframework.stereotype.Service;

@@ -16,10 +14,8 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.Rea
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;

import lombok.RequiredArgsConstructor;

@@ -27,7 +23,6 @@ import lombok.RequiredArgsConstructor;
@RequiredArgsConstructor
public class DocstrumSegmentationService {

public static final double SAME_DIRECTION_THRESHOLD = 0.9;
private final NearestNeighbourService nearestNeighbourService;
private final SpacingService spacingService;
private final LineBuilderService lineBuilderService;

@@ -35,52 +30,27 @@ public class DocstrumSegmentationService {
private final ReadingOrderService readingOrderService;

public List<Zone> segmentPage(List<Word> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) {
public List<Zone> segmentPage(List<Word> words, boolean xyOrder, CleanRulings usedRulings) {

EnumMap<TextDirection, Integer> directionCounts = new EnumMap<>(TextDirection.class);

List<Zone> newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO);
directionCounts.put(TextDirection.ZERO, newZones.size());
List<Zone> newZones = computeZones(words, usedRulings, TextDirection.ZERO);
List<Zone> zones = new ArrayList<>(newZones);

newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE);
directionCounts.put(TextDirection.QUARTER_CIRCLE, newZones.size());
newZones = computeZones(words, usedRulings, TextDirection.QUARTER_CIRCLE);
zones.addAll(newZones);

newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE);
directionCounts.put(TextDirection.HALF_CIRCLE, newZones.size());
newZones = computeZones(words, usedRulings, TextDirection.HALF_CIRCLE);
zones.addAll(newZones);

newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE);
directionCounts.put(TextDirection.THREE_QUARTER_CIRCLE, newZones.size());
newZones = computeZones(words, usedRulings, TextDirection.THREE_QUARTER_CIRCLE);
zones.addAll(newZones);

return readingOrderService.resolve(zones, xyOrder, mostSameDirection(directionCounts));
return readingOrderService.resolve(zones, xyOrder);
}

private boolean mostSameDirection(EnumMap<TextDirection, Integer> directionCounts) {
private List<Zone> computeZones(List<Word> words, CleanRulings rulings, TextDirection direction) {

int total = directionCounts.values()
.stream()
.mapToInt(i -> i).sum();

if ((double) directionCounts.get(TextDirection.ZERO) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.get(TextDirection.QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.get(TextDirection.HALF_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.get(TextDirection.THREE_QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
return true;
}
return false;
}

private List<Zone> computeZones(List<Word> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) {

List<Character> characters = textPositions.stream()
List<Character> characters = words.stream()
.filter(t -> t.getDir() == direction)
.map(Word::getCharacters)
.flatMap(List::stream)

@@ -1,7 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;

import java.awt.geom.Rectangle2D;
import java.util.Comparator;
import java.util.List;

import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;

@@ -25,8 +24,6 @@ public abstract class BoundingBox {
// Also, these are definitely correct and should be used whenever possible.
protected Rectangle2D bBoxPdf;

protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;

public double getX() {

@@ -204,23 +201,22 @@ public abstract class BoundingBox {
}

public double verticalOverlap(BoundingBox other) {
public double verticalOverlapPdf(BoundingBox other) {

return Math.max(0, Math.min(this.getPdfMaxY(), other.getPdfMaxY()) - Math.max(this.getPdfMinY(), other.getPdfMinY()));
}

public static final Comparator<BoundingBox> ILL_DEFINED_ORDER = (o1, o2) -> {
public double verticalOverlap(BoundingBox other) {

if (o1.equals(o2)) {
return 0;
}
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD * ((o1.getHeight() + o2.getHeight()) / 2)) {
return Double.compare(o1.getPdfMinX(), o2.getPdfMinX());
} else {
return Double.compare(o1.getPdfMaxY(), o2.getPdfMaxY());
}
};
return Math.max(0, Math.min(this.getMaxY(), other.getMaxY()) - Math.max(this.getMinY(), other.getMinY()));
}

public double horizontalOverlap(BoundingBox other) {

return Math.max(0, Math.min(this.getMaxX(), other.getMaxX()) - Math.max(this.getMinX(), other.getMinX()));
}

public double horizontalDistance(BoundingBox other) {

@@ -276,4 +272,13 @@ public abstract class BoundingBox {
return this.intersectsX(other) && this.getMinY() >= other.getMaxY();
}

public double intersectedArea(BoundingBox r2) {

double xOverlap = horizontalOverlap(r2);
double yOverlap = verticalOverlap(r2);

return xOverlap * yOverlap;
}

}

@@ -2,7 +2,9 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;

import java.awt.geom.Rectangle2D;
import java.util.List;
import java.util.Set;
import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;

import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;

@@ -36,19 +38,16 @@ public abstract class TextBoundingBox extends BoundingBox {
.map(TextBoundingBox::getBBoxDirAdj)
.collect(RectangleTransformations.collectBBox());

Set<TextDirection> textDirections = components.stream()
Optional<TextDirection> mostCommonDir = components.stream()
.filter(c -> c instanceof TextBoundingBox)
.map(c -> (TextBoundingBox) c)
.map(TextBoundingBox::getDir)
.collect(Collectors.toSet());
.collect(Collectors.groupingBy(Function.identity(), Collectors.counting())).entrySet()
.stream()
.max(Map.Entry.comparingByValue())
.map(Map.Entry::getKey);

if (textDirections.isEmpty()) {
dir = TextDirection.ZERO;
} else if (textDirections.size() > 1) {
throw new IllegalArgumentException("More than one text direction found");
} else {
dir = textDirections.iterator().next();
}
dir = mostCommonDir.orElse(TextDirection.ZERO);
}

@@ -9,7 +9,7 @@ import org.springframework.stereotype.Service;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;

@Service

@@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.List;
import java.util.ListIterator;

@@ -12,25 +13,43 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service;

import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;

@Service
public class ReadingOrderService {

private static final double THRESHOLD = 5;
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
public static final double SAME_DIRECTION_THRESHOLD = 0.9;

private static final Comparator<TextBoundingBox> COMPARATOR = //
Comparator.comparing(TextBoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(TextBoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
Comparator.comparing(TextBoundingBox::getY,
(o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(TextBoundingBox::getX,
(o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));

private static final Comparator<TextBoundingBox> COMPARATOR_DIR_ADJ = //
Comparator.comparing(TextBoundingBox::getYDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
Comparator.comparing(TextBoundingBox::getYDirAdj,
(o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
.thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));

public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder, boolean useDirAdjCoords) {
public List<AbstractPageBlock> resolve(List<TextPageBlock> textBlocks, List<TablePageBlock> tables) {

List<AbstractPageBlock> unsortedBlocks = new ArrayList<>(textBlocks.size() + tables.size());
unsortedBlocks.addAll(textBlocks);
unsortedBlocks.addAll(tables);
return resolve(unsortedBlocks, false);
}

public <T extends TextBoundingBox> List<T> resolve(List<T> zones, boolean xyReadingOrder) {

boolean useDirAdjCoords = mostSameDirection(zones);

if (zones.isEmpty() || zones.size() == 1) {
return zones;

@@ -41,7 +60,7 @@ public class ReadingOrderService {
}

Map<Long, Integer> histogram = new HashMap<>();
for (Zone zone : zones) {
for (TextBoundingBox zone : zones) {
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
long minY = Math.round(bbox.getMinY());
long maxY = Math.round(bbox.getMaxY());

@@ -52,8 +71,7 @@

if (histogram.values()
.stream()
.mapToInt(Integer::intValue).average()
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
.mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
return resolveSingleColumnReadingOrder(zones, useDirAdjCoords);
} else {

@@ -63,7 +81,7 @@
}

private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones, boolean useDirAdjCoords) {
private static <T extends TextBoundingBox> List<T> resolveSingleColumnReadingOrder(List<T> zones, boolean useDirAdjCoords) {

if (useDirAdjCoords) {
return zones.stream()

@@ -71,7 +89,7 @@
.stream()
.flatMap(words -> words.stream()
.sorted(COMPARATOR_DIR_ADJ))
.toList();
.collect(Collectors.toList());
}

zones.sort(COMPARATOR);

@@ -79,7 +97,7 @@
}

private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones, boolean useDirAdjCoords) {
private <T extends TextBoundingBox> List<T> resolveMultiColumnReadingOder(List<T> zones, boolean useDirAdjCoords) {

// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order

@@ -87,7 +105,7 @@
double minX = Double.POSITIVE_INFINITY;
double maxX = Double.NEGATIVE_INFINITY;

for (Zone zone : zones) {
for (T zone : zones) {
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
if (bbox.getX() < minX) {
minX = zone.getXDirAdj();

@@ -99,11 +117,11 @@

double midLineXCoordinate = (minX + maxX) / 2;

List<Zone> leftOf = new ArrayList<>();
List<Zone> rightOf = new ArrayList<>();
List<Zone> middle = new ArrayList<>();
List<T> leftOf = new ArrayList<>();
List<T> rightOf = new ArrayList<>();
List<T> middle = new ArrayList<>();

for (Zone zone : zones) {
for (T zone : zones) {
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
if (bbox.getX() < midLineXCoordinate && bbox.getX() + bbox.getWidth() < midLineXCoordinate) {
leftOf.add(zone);

@@ -166,14 +184,14 @@
middle.addAll(leftNotIntersecting);
middle.addAll(rightNotIntersecting);
*/
List<Zone> sortedZones = new ArrayList<>();
List<T> sortedZones = new ArrayList<>();
sortedZones.addAll(leftOf);
sortedZones.addAll(rightOf);

ListIterator<Zone> itty = middle.listIterator();
ListIterator<T> itty = middle.listIterator();

while (itty.hasNext()) {
Zone current = itty.next();
T current = itty.next();
Rectangle2D bbox = useDirAdjCoords ? current.getBBoxDirAdj() : current.getBBox();
for (int i = 0; i < sortedZones.size(); i++) {
if (bbox.getY() < sortedZones.get(i).getY()) {

@@ -189,4 +207,29 @@
return sortedZones;
}

private boolean mostSameDirection(List<? extends TextBoundingBox> zones) {

EnumMap<TextDirection, Integer> directionCounts = new EnumMap<>(TextDirection.class);

for (TextBoundingBox zone : zones) {
TextDirection dir = zone.getDir();
directionCounts.put(dir, directionCounts.getOrDefault(dir, 0) + 1);
}
int total = directionCounts.values()
.stream()
.mapToInt(i -> i).sum();

if ((double) directionCounts.getOrDefault(TextDirection.ZERO, 0) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.getOrDefault(TextDirection.QUARTER_CIRCLE, 0) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.getOrDefault(TextDirection.HALF_CIRCLE, 0) / total > SAME_DIRECTION_THRESHOLD) {
return true;
} else if ((double) directionCounts.getOrDefault(TextDirection.THREE_QUARTER_CIRCLE, 0) / total > SAME_DIRECTION_THRESHOLD) {
return true;
}
return false;
}

}

@@ -10,7 +10,7 @@ import java.util.stream.Collectors;
import org.springframework.stereotype.Service;

import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;

@@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
package com.knecon.fforesight.service.layoutparser.processor.experimental;

import java.awt.geom.Line2D;
import java.awt.geom.Rectangle2D;

@@ -7,7 +7,6 @@ import java.util.LinkedList;
import java.util.List;
import java.util.stream.Stream;

import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;

import lombok.experimental.UtilityClass;

@@ -23,13 +22,13 @@ public class DividingColumnDetectionService {
public List<Rectangle2D> detectColumns(PageContents pageContents) {

if (pageContents.getSortedWords().size() < 2) {
return List.of(pageContents.getCropBox());
if (pageContents.getWords().size() < 2) {
return List.of(pageContents.getPageInformation().cropBox());
}

GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedWords(), pageContents.getCropBox());
GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getWords(), pageContents.getPageInformation().cropBox());

return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getCropBox());
return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getPageInformation().cropBox());
}

@@ -1,10 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
package com.knecon.fforesight.service.layoutparser.processor.experimental;

import java.awt.geom.Rectangle2D;
import java.util.LinkedList;
import java.util.List;

import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;

import lombok.AllArgsConstructor;

@@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
package com.knecon.fforesight.service.layoutparser.processor.experimental;

import java.awt.geom.Rectangle2D;
import java.util.LinkedList;

@@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
package com.knecon.fforesight.service.layoutparser.processor.experimental;

import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;

@@ -6,9 +6,6 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.stream.Stream;
import com.iqser.red.commons.jackson.ObjectMapperFactory;

import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;

import lombok.AllArgsConstructor;
import lombok.Getter;

@@ -1,12 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
package com.knecon.fforesight.service.layoutparser.processor.experimental;

import java.awt.geom.Rectangle2D;
import java.awt.geom.RectangularShape;
import java.util.LinkedList;
import java.util.List;

import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;

import lombok.experimental.UtilityClass;

@@ -1,11 +1,9 @@
package com.knecon.fforesight.service.layoutparser.processor.services;
package com.knecon.fforesight.service.layoutparser.processor.experimental;

import java.awt.geom.Rectangle2D;
import java.util.LinkedList;
import java.util.List;

import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;

@@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
package com.knecon.fforesight.service.layoutparser.processor.experimental;

import java.awt.geom.Rectangle2D;
import java.util.List;

@@ -1,12 +1,15 @@
package com.knecon.fforesight.service.layoutparser.processor.model;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;

import lombok.AllArgsConstructor;
import lombok.Data;

@@ -17,18 +20,18 @@ import lombok.NoArgsConstructor;
@AllArgsConstructor
@NoArgsConstructor
@EqualsAndHashCode(callSuper = true)
public abstract class AbstractPageBlock extends BoundingBox {
public abstract class AbstractPageBlock extends TextBoundingBox {

@JsonIgnore
protected PageBlockType classification;

Set<LayoutEngine> engines = new HashSet<>();
protected Set<LayoutEngine> engines = new HashSet<>();

@JsonIgnore
protected int page;

@JsonIgnore
private Orientation orientation = Orientation.NONE;
protected Orientation orientation = Orientation.NONE;

public abstract String getText();

@@ -42,4 +45,6 @@ public abstract class AbstractPageBlock extends BoundingBox {

public abstract boolean isEmpty();

public abstract List<Word> getWords();

}

@@ -5,6 +5,8 @@ import java.util.List;

import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;

@@ -1,5 +1,6 @@
package com.knecon.fforesight.service.layoutparser.processor.model;

import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;

@@ -11,29 +12,38 @@ import com.knecon.fforesight.service.layoutparser.processor.model.image.Classifi
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;

import lombok.Data;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;

@Data
@RequiredArgsConstructor

public class ClassificationPage {

public ClassificationPage(List<AbstractPageBlock> pageBlocks, PageInformation pageInformation, CleanRulings cleanRulings) {

this.cleanRulings = cleanRulings;
this.pageNumber = pageInformation.number();
this.textBlocks = pageBlocks;
var mediaBox = pageInformation.mediabox();
int rotation = pageInformation.rotationDegrees();
this.landscape = mediaBox.getWidth() > mediaBox.getHeight() && (rotation == 0 || rotation == 180) //
|| mediaBox.getHeight() > mediaBox.getWidth() && (rotation == 90 || rotation == 270);
this.pageInformation = pageInformation;
}

private PageInformation pageInformation;
@NonNull
private List<AbstractPageBlock> textBlocks;

private List<OutlineObject> outlineObjects = new ArrayList<>();

private List<AbstractPageBlock> headlines = new ArrayList<>();

private List<ClassifiedImage> images = new ArrayList<>();

private Rectangle bodyTextFrame;

private boolean landscape;
private int rotation;

private int pageNumber;

@@ -42,11 +52,32 @@ public class ClassificationPage {
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();

private float pageWidth;
private float pageHeight;

private CleanRulings cleanRulings;

private Map<String, List<Rectangle2D>> markedContentBboxPerType = new HashMap<>();

public AffineTransform getPdfToPageTransform() {

return CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(getPageInformation());
}

public int getRotation() {

return pageInformation.rotationDegrees();
}

public float getPageWidth() {

return (float) pageInformation.width();
}

public float getPageHeight() {

return (float) pageInformation.height();
}

}

@@ -1,10 +1,10 @@
package com.knecon.fforesight.service.layoutparser.processor.model;

import java.awt.geom.Rectangle2D;
import java.util.List;

import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;

import lombok.AllArgsConstructor;
import lombok.Builder;

@@ -15,8 +15,9 @@ import lombok.Getter;
@AllArgsConstructor
public class PageContents {

List<Word> sortedWords;
Rectangle2D cropBox;
Rectangle2D mediaBox;
PageInformation pageInformation;
List<Word> words;
List<Ruling> rulings;
List<Box> graphicBBoxes;

}

@ -2,16 +2,63 @@ package com.knecon.fforesight.service.layoutparser.processor.model;

import java.awt.geom.Rectangle2D;

import lombok.AllArgsConstructor;
import lombok.Getter;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;

@Getter
@AllArgsConstructor
public class PageInformation {
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;

PageContents pageContents;
LineInformation lineInformation;
Rectangle2D mainBodyTextFrame;
GapInformation gapInformation;
public record PageInformation(Rectangle2D mediabox, Rectangle2D cropBox, int number, int rotationDegrees) {

}
public static PageInformation fromPDPage(int pageNum, PDPage page) {

PDRectangle mediaBox = page.getMediaBox();
PDRectangle cropBox = page.getCropBox();
return new PageInformation(new Rectangle2D.Double(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight()),
new Rectangle2D.Double(cropBox.getLowerLeftX(), cropBox.getLowerLeftY(), cropBox.getWidth(), cropBox.getHeight()),
pageNum,
page.getRotation());
}


public static PageInformation fromPage(Page page) {

return new PageInformation(new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()),
new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()),
page.getNumber(),
page.getRotation());
}


public double height() {

return mediabox.getHeight();
}


public double heightRot() {

if (rotationDegrees == 90 || rotationDegrees == 270) {
return width();
}
return height();
}


public double width() {

return mediabox.getWidth();
}


public double minX() {

return mediabox.getX();
}


public double minY() {

return mediabox.getY();
}

}
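A minimal usage sketch for the new PageInformation record shown above. Illustrative only: the demo class name is made up, it assumes it sits in the same package as PageInformation, and the printed A4 values are PDFBox defaults, quoted approximately.

import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;

public class PageInformationDemo {

    public static void main(String[] args) {

        PDPage page = new PDPage(PDRectangle.A4); // roughly 595.28 x 841.89 points
        page.setRotation(90);

        PageInformation info = PageInformation.fromPDPage(1, page);

        System.out.println(info.width());     // ~595.28
        System.out.println(info.height());    // ~841.89
        System.out.println(info.heightRot()); // ~595.28 -> the width, because the page is rotated by 90 degrees
    }
}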
@ -4,6 +4,7 @@ import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@ -77,7 +78,7 @@ public class SectionIdentifier {
List<Integer> identifiers = new LinkedList<>();
for (int i = 1; i <= 4; i++) {
String numericalIdentifier = numericalIdentifierMatcher.group(i);
if (numericalIdentifier == null || numericalIdentifier.equals("0") || numericalIdentifier.isEmpty() || numericalIdentifier.isBlank()) {
if (numericalIdentifier == null || Objects.equals(numericalIdentifier, "0") || numericalIdentifier.isBlank()) {
break;
}
identifiers.add(Integer.parseInt(numericalIdentifier.trim()));
@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
package com.knecon.fforesight.service.layoutparser.processor.model;

import java.util.Collection;
import java.util.LinkedHashMap;
@ -2,12 +2,14 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline;

import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Optional;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
@ -28,7 +30,7 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlin
import org.springframework.stereotype.Service;

import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;

import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@ -48,19 +50,22 @@ public class OutlineExtractorService {


@SneakyThrows
public OutlineObjectTree getOutlineObjectTree(PDDocument document) {
public OutlineObjectTree getOutlineObjectTree(File documentFile) {

PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();
try (var document = Loader.loadPDF(documentFile)) {

List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
if (documentOutline != null) {
for (PDOutlineItem child : documentOutline.children()) {
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1);
outlineObjectWithChildren.ifPresent(rootNodes::add);
PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();

List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
if (documentOutline != null) {
for (PDOutlineItem child : documentOutline.children()) {
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1);
outlineObjectWithChildren.ifPresent(rootNodes::add);
}
}
}

return new OutlineObjectTree(rootNodes);
return new OutlineObjectTree(rootNodes);
}
}


@ -128,9 +133,7 @@ public class OutlineExtractorService {
log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title));
}

return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title,
pageNumber,
transformPointToPageCoords(outlinePosition, userSpaceToPageCoords), depth)));
return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, transformPointToPageCoords(outlinePosition, userSpaceToPageCoords), depth)));
}
@ -10,8 +10,8 @@ import org.springframework.stereotype.Service;

import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
@ -4,13 +4,15 @@ import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Collection;
import java.util.List;

import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
import com.knecon.fforesight.service.ocr.v1.api.model.TableCell;

import lombok.Data;
import lombok.EqualsAndHashCode;
@ -22,7 +24,7 @@ import lombok.NoArgsConstructor;
@NoArgsConstructor
public class Cell extends BoundingBox {

private List<TextPageBlock> textBlocks = new ArrayList<>();
private List<AbstractPageBlock> textBlocks = new ArrayList<>();

private List<Cell> headerCells = new ArrayList<>();

@ -33,17 +35,41 @@ public class Cell extends BoundingBox {
private int pageNumber;


public Cell(Point2D topLeft, Point2D bottomRight) {
public Cell(Point2D topLeft, Point2D bottomRight, AffineTransform pdfToPageTransform) {

this.bBoxPdf = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
this.bBox = bBoxPdf;
this.bBox = RectangleTransformations.transform(bBoxPdf, pdfToPageTransform);
}


public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) {
public static Cell fromPageCoordinates(Point2D topLeft, Point2D bottomRight, AffineTransform pageToPdfTransform) {

var bBox = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
return fromPageCoordinates(bBox, pageToPdfTransform);
}


public static Cell fromPageCoordinates(Rectangle2D r, AffineTransform pageToPdfTransform) {

Cell cell = new Cell();
var bBoxPdf = RectangleTransformations.transform(r, pageToPdfTransform);
cell.bBox = r;
cell.bBoxPdf = bBoxPdf;
return cell;
}


public Cell(TableCell tableCell, AffineTransform pdfToPageTransform) {

this.bBoxPdf = tableCell.textRegion().region().bbox().get().getBounds2D();
this.bBox = RectangleTransformations.transform(bBoxPdf, pdfToPageTransform);
}


public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform pdfToPageTransform) {

this.bBoxPdf = bBoxInitialUserSpace;
this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D();
this.bBox = RectangleTransformations.transform(bBoxPdf, pdfToPageTransform);
}


@ -56,9 +82,12 @@ public class Cell extends BoundingBox {
}


public void addTextBlock(TextPageBlock textBlock) {
public List<Word> getWords() {

textBlocks.add(textBlock);
return getTextBlocks().stream()
.map(AbstractPageBlock::getWords)
.flatMap(Collection::stream)
.toList();
}


@ -67,24 +96,12 @@ public class Cell extends BoundingBox {

StringBuilder sb = new StringBuilder();

Iterator<TextPageBlock> itty = textBlocks.iterator();
Word previous = null;
while (itty.hasNext()) {

TextPageBlock textBlock = itty.next();

for (Word word : textBlock.getWords()) {
if (previous != null) {
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
sb.append('\n');
} else {
sb.append(' ');
}
}
sb.append(word.toString());
previous = word;
for (int i = 0; i < textBlocks.size(); i++) {
AbstractPageBlock textBlock = textBlocks.get(i);
sb.append(textBlock);
if (i < textBlocks.size() - 1) {
sb.append("\n");
}

}

return TextNormalizationUtilities.cleanString(sb.toString());
@ -22,6 +22,12 @@ public class CleanRulings {
List<Ruling> verticals; // unmodifiable sorted by X list


public static CleanRulings empty() {

return new CleanRulings(Collections.emptyList(), Collections.emptyList());
}


public CleanRulings(List<Ruling> horizontals, List<Ruling> verticals) {

this.horizontals = horizontals.stream()
@ -0,0 +1,334 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;

import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;

import lombok.AccessLevel;
import lombok.Getter;
import lombok.Setter;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;

@Slf4j
@Getter
@FieldDefaults(level = AccessLevel.PRIVATE)
public class LinkedQuadPointCell {

public static final int MAX_NEIGHBOUR_DISTANCE = 2;
public static final int MAX_ANGLE_DIFFERENCE = 5;
public static final double LINE_INTERSECT_THRESHOLD = 2;

final QuadPoint quadPoint;
final List<AbstractPageBlock> pageBlocks;
final List<LinkedQuadPointCell> rights = new ArrayList<>();
final List<LinkedQuadPointCell> lefts = new ArrayList<>();
final List<LinkedQuadPointCell> aboves = new ArrayList<>();
final List<LinkedQuadPointCell> belows = new ArrayList<>();

@Setter
boolean headerCell;


public LinkedQuadPointCell(QuadPoint quadPoint, List<AbstractPageBlock> pageBlocks) {

this.quadPoint = quadPoint;
this.pageBlocks = pageBlocks;
headerCell = false;
}


public boolean contains(Word word) {

return quadPoint.contains(word.getBBox().getCenterX(), word.getBBox().getCenterY());
}


public String toString() {

return getPageBlocks().stream()
.map(AbstractPageBlock::toString)
.collect(Collectors.joining("\n"));
}


public void addToNeighbours(LinkedQuadPointCell otherCell, double minWidth, double minHeight) {

if (rightNeighbour(otherCell, minHeight)) {
rights.add(otherCell);
}
if (leftNeighbour(otherCell, minHeight)) {
lefts.add(otherCell);
}
if (aboveNeighbour(otherCell, minWidth)) {
aboves.add(otherCell);
}
if (belowNeighbour(otherCell, minWidth)) {
belows.add(otherCell);
}
}


public boolean leftNeighbour(LinkedQuadPointCell other, double minHeight) {

Line2D right = this.quadPoint.getLeftLine();
Line2D left = other.quadPoint.getRightLine();
return isYIntersectionSignificant(right, left, minHeight) && areLinesSimilar(right, left);
}


public boolean rightNeighbour(LinkedQuadPointCell other, double minHeight) {

Line2D right = other.quadPoint.getLeftLine();
Line2D left = this.quadPoint.getRightLine();
return isYIntersectionSignificant(right, left, minHeight) && areLinesSimilar(right, left);
}


public boolean aboveNeighbour(LinkedQuadPointCell other, double minWidth) {

Line2D top = other.quadPoint.getTopLine();
Line2D bottom = this.quadPoint.getBottomLine();
return isXIntersectionSignificant(top, bottom, minWidth) && areLinesSimilar(top, bottom);
}


public boolean belowNeighbour(LinkedQuadPointCell other, double minWidth) {

Line2D top = this.quadPoint.getTopLine();
Line2D bottom = other.quadPoint.getBottomLine();
return isXIntersectionSignificant(top, bottom, minWidth) && areLinesSimilar(top, bottom);
}


public static boolean areLinesSimilar(Line2D line1, Line2D line2) {

double angle1 = Math.atan2(line1.getY2() - line1.getY1(), line1.getX2() - line1.getX1());
double angle2 = Math.atan2(line2.getY2() - line2.getY1(), line2.getX2() - line2.getX1());

double angleDifference = Math.toDegrees(Math.abs(angle1 - angle2));

angleDifference = Math.min(angleDifference, 360 - angleDifference);

if (angleDifference >= MAX_ANGLE_DIFFERENCE) {
return false;
}

double distance1 = line1.ptSegDist(line2.getP1());
double distance2 = line1.ptSegDist(line2.getP2());
double distance3 = line2.ptSegDist(line1.getP1());
double distance4 = line2.ptSegDist(line1.getP2());

double minDistance = Math.min(Math.min(distance1, distance2), Math.min(distance3, distance4));

return minDistance < MAX_NEIGHBOUR_DISTANCE;
}


public static boolean isXIntersectionSignificant(Line2D line1, Line2D line2, double minWidth) {

double start1 = Math.min(line1.getX1(), line1.getX2());
double end1 = Math.max(line1.getX1(), line1.getX2());
double start2 = Math.min(line2.getX1(), line2.getX2());
double end2 = Math.max(line2.getX1(), line2.getX2());
double intersectionStart = Math.max(start1, start2);
double intersectionEnd = Math.min(end1, end2);
return intersectionEnd - intersectionStart >= minWidth;
}


public static boolean isYIntersectionSignificant(Line2D line1, Line2D line2, double minHeight) {

double start1 = Math.min(line1.getY1(), line1.getY2());
double end1 = Math.max(line1.getY1(), line1.getY2());
double start2 = Math.min(line2.getY1(), line2.getY2());
double end2 = Math.max(line2.getY1(), line2.getY2());
double intersectionStart = Math.max(start1, start2);
double intersectionEnd = Math.min(end1, end2);
return intersectionEnd - intersectionStart >= minHeight;
}


public void resetNeighbours() {

rights.clear();
lefts.clear();
aboves.clear();
belows.clear();
}


public boolean needsSplit() {

return rights.size() > 1 || lefts.size() > 1 || aboves.size() > 1 || belows.size() > 1;
}


private LinkedQuadPointCell copyCell(Point2D a, Point2D b, Point2D c, Point2D d) {

var cell = new LinkedQuadPointCell(new QuadPoint(a, b, c, d), pageBlocks);
cell.setHeaderCell(headerCell);
return cell;
}


public boolean isTopLeft() {

return lefts.isEmpty() && aboves.isEmpty();
}


public Collection<LinkedQuadPointCell> split(double minWidth, double minHeight) {

List<LinkedQuadPointCell> newCells;
if (rights.size() > 1 && rights.size() >= lefts.size()) {
newCells = splitY(rights, minHeight);
return newCells;
}
if (lefts.size() > 1) {
newCells = splitY(lefts, minHeight);
return newCells;
}
if (aboves.size() > 1 && aboves.size() >= belows.size()) {
newCells = splitX(aboves, minWidth);
return newCells;
}
if (belows.size() > 1) {
newCells = splitX(belows, minWidth);
return newCells;
}
return List.of(this);
}


private List<LinkedQuadPointCell> splitY(List<LinkedQuadPointCell> neighbours, double minHeight) {

List<LinkedQuadPointCell> splitCells = new LinkedList<>();
List<Line2D> ySplitLines = neighbours.stream()
.map(LinkedQuadPointCell::getQuadPoint)
.map(QuadPoint::getTopLine)
.sorted(Comparator.comparing(line -> (line.getY1() + line.getY2()) / 2))
.toList();
Line2D rightLine = quadPoint.getRightLine();
Line2D leftLine = quadPoint.getLeftLine();
Line2D topLine = quadPoint.getTopLine();
Point2D lowerLeft = quadPoint.getLowerLeft();
Point2D lowerRight = quadPoint.getLowerRight();
Point2D topLeft;
Point2D topRight;
for (Line2D neighborLine : ySplitLines) {
if (Math.abs(neighborLine.getY1() - topLine.getY1()) < minHeight || Math.abs(neighborLine.getY2() - topLine.getY2()) < minHeight) {
continue;
}
var topLeftOptional = findIntersectionPoint(leftLine, neighborLine);
var lowerRightOptional = findIntersectionPoint(rightLine, neighborLine);
if (topLeftOptional.isEmpty() || lowerRightOptional.isEmpty()) {
continue;
}
topLeft = topLeftOptional.get();
topRight = lowerRightOptional.get();
if (Math.abs(topLeft.getY() - lowerLeft.getY()) < minHeight || Math.abs(topRight.getY() - lowerRight.getY()) < minHeight) {
continue;
}
LinkedQuadPointCell cell = copyCell(topLeft, lowerLeft, lowerRight, topRight);
splitCells.add(cell);
lowerLeft = topLeft;
lowerRight = topRight;
}
LinkedQuadPointCell cell = copyCell(topLine.getP1(), lowerLeft, lowerRight, topLine.getP2());
splitCells.add(cell);
return splitCells;
}


/*
Finds the intersection point of the line and the extended line. Where the intersectionPoint must lie within the range of the line, but the extendedLine may be extended as far as needed.
*/
private Optional<Point2D> findIntersectionPoint(Line2D line, Line2D lineToExtend) {

double x1 = line.getX1();
double y1 = line.getY1();
double x2 = line.getX2();
double y2 = line.getY2();

double x3 = lineToExtend.getX1();
double y3 = lineToExtend.getY1();
double x4 = lineToExtend.getX2();
double y4 = lineToExtend.getY2();

double denom = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4);

// If denominator is 0, lines are parallel or coincident
if (denom == 0) {
return Optional.empty();
}

double intersectX = ((x1 * y2 - y1 * x2) * (x3 - x4) - (x1 - x2) * (x3 * y4 - y3 * x4)) / denom;
double intersectY = ((x1 * y2 - y1 * x2) * (y3 - y4) - (y1 - y2) * (x3 * y4 - y3 * x4)) / denom;

Point2D intersection = new Point2D.Double(intersectX, intersectY);

// Check if the intersection point lies within the bounds of the line segment
if (intersection.getX() >= Math.min(x1 - LINE_INTERSECT_THRESHOLD, x2 - LINE_INTERSECT_THRESHOLD) && intersection.getX() <= Math.max(x1 + LINE_INTERSECT_THRESHOLD,
x2 + LINE_INTERSECT_THRESHOLD)//
&& intersection.getY() >= Math.min(y1 - LINE_INTERSECT_THRESHOLD, y2 - LINE_INTERSECT_THRESHOLD) && intersection.getY() <= Math.max(y1 + LINE_INTERSECT_THRESHOLD,
y2 + LINE_INTERSECT_THRESHOLD)) {
return Optional.of(intersection);
}

return Optional.empty();
}


private List<LinkedQuadPointCell> splitX(List<LinkedQuadPointCell> neighbours, double minWidth) {

List<Line2D> xSplitLines = neighbours.stream()
.map(LinkedQuadPointCell::getQuadPoint)
.map(QuadPoint::getRightLine)
.sorted(Comparator.comparing(line -> (line.getX1() + line.getX2()) / 2))
.toList();
if (xSplitLines.isEmpty()) {
return List.of(this);
}
List<LinkedQuadPointCell> splitCells = new LinkedList<>();
Line2D topLine = quadPoint.getTopLine();
Line2D bottomLine = quadPoint.getBottomLine();
Line2D rightLine = quadPoint.getRightLine();
Point2D topLeft = quadPoint.getTopLeft();
Point2D lowerLeft = quadPoint.getLowerLeft();
Point2D topRight;
Point2D lowerRight;
for (Line2D neighborLine : xSplitLines) {
if (Math.abs(rightLine.getX1() - neighborLine.getX1()) < minWidth || Math.abs(rightLine.getX2() - neighborLine.getX2()) < minWidth) {
continue;
}
var topRightOptional = findIntersectionPoint(topLine, neighborLine);
var lowerRightOptional = findIntersectionPoint(bottomLine, neighborLine);
if (topRightOptional.isEmpty() || lowerRightOptional.isEmpty()) {
continue;
}
topRight = topRightOptional.get();
lowerRight = lowerRightOptional.get();
if (Math.abs(topRight.getX() - topLeft.getX()) < minWidth || Math.abs(lowerRight.getX() - lowerLeft.getX()) < minWidth) {
continue;
}
LinkedQuadPointCell cell = copyCell(lowerLeft, topLeft, topRight, lowerRight);
topLeft = topRight;
lowerLeft = lowerRight;
splitCells.add(cell);
}
LinkedQuadPointCell cell = copyCell(lowerLeft, topLeft, rightLine.getP1(), rightLine.getP2());
splitCells.add(cell);
return splitCells;
}

}
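A small sketch of how the new LinkedQuadPointCell linking is intended to be used. Illustrative only: the demo class name is made up, the minWidth/minHeight values are arbitrary, and it assumes the demo sits in the same package as LinkedQuadPointCell and QuadPoint. Two axis-aligned cells that share the vertical edge x = 50 end up linked as left/right neighbours, so neither cell needs a split.

import java.awt.geom.Rectangle2D;
import java.util.List;

public class LinkedQuadPointCellDemo {

    public static void main(String[] args) {

        // Two grid cells sharing the vertical border at x = 50, without any page blocks attached.
        LinkedQuadPointCell left = new LinkedQuadPointCell(QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, 50, 20)), List.of());
        LinkedQuadPointCell right = new LinkedQuadPointCell(QuadPoint.fromRectangle2D(new Rectangle2D.Double(50, 0, 50, 20)), List.of());

        // minWidth / minHeight control how much edge overlap counts as a shared border.
        left.addToNeighbours(right, 5, 5);
        right.addToNeighbours(left, 5, 5);

        System.out.println(left.getRights().size()); // 1 -> the second cell was linked as the right-hand neighbour
        System.out.println(left.isTopLeft());        // true -> no neighbours to the left or above
        System.out.println(left.needsSplit());       // false -> at most one neighbour per side
    }
}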
@ -0,0 +1,291 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;

import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Objects;
import java.util.stream.Stream;

import com.knecon.fforesight.service.ocr.v1.api.model.QuadPointData;

import lombok.Getter;

public final class QuadPoint {

/*
B _____ C
| |
A|_____|D
*/

@Getter
private final Point2D a;
@Getter
private final Point2D b;
@Getter
private final Point2D c;
@Getter
private final Point2D d;

private Line2D left;
private Line2D right;
private Line2D top;
private Line2D bottom;


// This constructor assumes, the points form a convex polygon, I will omit the assertion for performance reasons.
public QuadPoint(Point2D a, Point2D b, Point2D c, Point2D d) {

List<Point2D> points = new ArrayList<>(4);
points.add(a);
points.add(b);
points.add(c);
points.add(d);
points.sort(Comparator.comparingDouble(Point2D::getX).thenComparing(Point2D::getY));
if (points.get(0).getY() >= points.get(1).getY()) {
this.a = points.get(0);
this.b = points.get(1);
} else {
this.a = points.get(1);
this.b = points.get(0);
}

if (points.get(2).getY() < points.get(3).getY()) {
this.c = points.get(2);
this.d = points.get(3);
} else {
this.c = points.get(3);
this.d = points.get(2);
}
}


public static QuadPoint fromRectangle2D(Rectangle2D rectangle2D) {

var lowerLeft = new Point2D.Double(rectangle2D.getX(), rectangle2D.getY());
var upperLeft = new Point2D.Double(rectangle2D.getX(), rectangle2D.getMaxY());
var upperRight = new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getMaxY());
var lowerRight = new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getY());

return new QuadPoint(lowerLeft, upperLeft, upperRight, lowerRight);
}


public Rectangle2D getBounds2D() {

double minX = Math.min(Math.min(Math.min(a.getX(), b.getX()), c.getX()), d.getX());
double minY = Math.min(Math.min(Math.min(a.getY(), b.getY()), c.getY()), d.getY());
double maxX = Math.max(Math.max(Math.max(a.getX(), b.getX()), c.getX()), d.getX());
double maxY = Math.max(Math.max(Math.max(a.getY(), b.getY()), c.getY()), d.getY());

return new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY);
}


public static QuadPoint fromData(QuadPointData data) {

return new QuadPoint(new Point2D.Double(data.values()[0], data.values()[1]),
new Point2D.Double(data.values()[2], data.values()[3]),
new Point2D.Double(data.values()[4], data.values()[5]),
new Point2D.Double(data.values()[6], data.values()[7]));

}


public Stream<Line2D> asLines() {

return Stream.of(new Line2D.Double(getA(), getB()), new Line2D.Double(getB(), getC()), new Line2D.Double(getC(), getD()), new Line2D.Double(getD(), getA()));

}


public QuadPoint getTransformed(AffineTransform at) {

return new QuadPoint(at.transform(a, null), at.transform(b, null), at.transform(c, null), at.transform(d, null));
}


public boolean contains(double x, double y) {
// split into two triangles, test if either contains the point, assumes the QuadPoint is convex and created correctly. More specifically, the points must be in the correct order.
return triangleContains(a, b, c, x, y) || triangleContains(a, c, d, x, y);
}


/*
checks if a triangle contains a point by converting the point to barycentric coordinates using cramer's rule and then checking if the linear combination is within the bounds of the triangle.
https://en.wikipedia.org/wiki/Barycentric_coordinate_system#Barycentric_coordinates_on_triangles
*/
private boolean triangleContains(Point2D a, Point2D b, Point2D c, double x, double y) {

// area of the triangle
double denominator = ((b.getY() - c.getY()) * (a.getX() - c.getX()) + (c.getX() - b.getX()) * (a.getY() - c.getY()));
double invertedDenominator = 1.0 / denominator;
double alpha = ((b.getY() - c.getY()) * (x - c.getX()) + (c.getX() - b.getX()) * (y - c.getY())) * invertedDenominator;
double beta = ((c.getY() - a.getY()) * (x - c.getX()) + (a.getX() - c.getX()) * (y - c.getY())) * invertedDenominator;

return alpha >= 0 && beta >= 0 && alpha + beta <= 1;
}


public boolean contains(Point2D p) {

return contains(p.getX(), p.getY());
}


public boolean contains(Rectangle2D r) {

double x = r.getX();
double y = r.getY();
double maxY = r.getMaxY();
double maxX = r.getMaxX();

Point2D p1 = new Point2D.Double(x, y);
Point2D p2 = new Point2D.Double(x, maxY);
Point2D p3 = new Point2D.Double(maxX, maxY);
Point2D p4 = new Point2D.Double(maxX, y);

return contains(p1) && contains(p2) && contains(p3) && contains(p4);
}


public double getCenterX() {

return (a.getX() + b.getX() + c.getX() + d.getX()) / 4;
}


public double getCenterY() {

return (a.getY() + b.getY() + c.getY() + d.getY()) / 4;
}


public Point2D getCenter() {

return new Point2D.Double(getCenterX(), getCenterY());
}


public boolean intersects(Line2D line) {

return contains(line.getP1()) || contains(line.getP2()) || asLines().anyMatch(qLine -> qLine.intersectsLine(line));
}


public Line2D getRightLine() {

if (right == null) {
right = new Line2D.Double(getLowerRight(), getTopRight());
}

return right;
}


public Line2D getLeftLine() {

if (left == null) {
left = new Line2D.Double(getLowerLeft(), getTopLeft());
}
return left;
}


public Line2D getBottomLine() {

if (bottom == null) {
bottom = new Line2D.Double(getLowerLeft(), getLowerRight());
}
return bottom;
}


public Line2D getTopLine() {

if (top == null) {
top = new Line2D.Double(getTopLeft(), getTopRight());
}
return top;
}


public Point2D getTopLeft() {

return a;

}


public Point2D getTopRight() {

return d;
}


public Point2D getLowerRight() {

return c;
}


public Point2D getLowerLeft() {

return b;
}


@Override
public String toString() {

return String.format("A:(%.2f, %.2f) | B:(%.2f, %.2f) | C:(%.2f, %.2f) | D:(%.2f, %.2f)",
getA().getX(),
getA().getY(),
getB().getX(),
getB().getY(),
getC().getX(),
getC().getY(),
getD().getX(),
getD().getY());
}


public double getAngle() {

return calculateAngle(a, d);
}


private static double calculateAngle(Point2D a, Point2D d) {

double deltaY = d.getY() - a.getY();
double deltaX = d.getX() - a.getX();
return Math.atan2(deltaY, deltaX);
}


@Override
public boolean equals(Object obj) {

if (obj == this) {
return true;
}
if (obj == null || obj.getClass() != this.getClass()) {
return false;
}
var that = (QuadPoint) obj;
return Objects.equals(this.a, that.a) && Objects.equals(this.b, that.b) && Objects.equals(this.c, that.c) && Objects.equals(this.d, that.d);
}


@Override
public int hashCode() {

return Objects.hash(a, b, c, d);
}

}
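A quick sketch of the QuadPoint containment and transform behaviour introduced above. Illustrative only: the demo class name is made up, it assumes the demo sits in the same package as QuadPoint, and the results in the comments follow from the two-triangle barycentric containment test.

import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;

public class QuadPointDemo {

    public static void main(String[] args) {

        QuadPoint quad = QuadPoint.fromRectangle2D(new Rectangle2D.Double(0, 0, 10, 5));

        // Point containment via the two-triangle barycentric test.
        System.out.println(quad.contains(new Point2D.Double(4, 2)));  // true
        System.out.println(quad.contains(new Point2D.Double(12, 2))); // false

        // After rotating the quad, containment stays consistent with the equally rotated point.
        AffineTransform rotate = AffineTransform.getRotateInstance(Math.toRadians(30));
        QuadPoint rotated = quad.getTransformed(rotate);
        System.out.println(rotated.contains(rotate.transform(new Point2D.Double(4, 2), null))); // true
    }
}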
@ -30,15 +30,24 @@ public class Ruling extends Line2D.Float {
OTHER
}

public enum Style {
SOLID,
DASHED
}

@Getter
@Setter
private Classification classification;
@Getter
@Setter
private Style style;


public Ruling(Point2D p1, Point2D p2) {

super(p1, p2);
this.classification = Classification.OTHER;
this.style = Style.SOLID;
}
@ -1,48 +1,48 @@
package com.knecon.fforesight.service.layoutparser.processor.model.table;

import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;

import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;

@Slf4j
@Getter
public class TablePageBlock extends AbstractPageBlock {

public static final double CELL_AREA_CONTAINED_THRESHOLD = 0.98;
private final TreeMap<CellPosition, Cell> cellTreeMap = new TreeMap<>();
private final TextPageBlock caption;

private final int rotation;
@Getter
@Setter
private String headline;
private int unrotatedRowCount;
private int unrotatedColCount;
private List<List<Cell>> rows;
@Getter
@Setter
private List<Cell> cells;
private final List<List<Cell>> rows;


public TablePageBlock(List<Cell> cells, int rotation) {
public TablePageBlock(TextPageBlock caption, List<List<Cell>> rows) {

setToBBoxOfComponents(cells);
this.cells = cells;
addCells(cells);
classification = PageBlockType.TABLE;
this.rotation = rotation;
this.classification = PageBlockType.TABLE;
this.caption = caption;
this.rows = rows;
setBBoxes();
}


private void setBBoxes() {

List<BoundingBox> components = Stream.of(getCells().stream(),
getCells().stream()
.map(Cell::getTextBlocks)
.flatMap(Collection::stream))
.flatMap(Function.identity())
.map(o -> (BoundingBox) o)
.toList();
setToBBoxOfComponents(components);
}
@ -53,28 +53,19 @@ public class TablePageBlock extends AbstractPageBlock {
}


public List<List<Cell>> getRows() {

if (rows == null) {
rows = computeRows();

// Ignore rows that does not contain any cells and values.
List<List<Cell>> rowsToRemove = new ArrayList<>();
for (List<Cell> row : rows) {
if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) {
rowsToRemove.add(row);
}
}
rows.removeAll(rowsToRemove);

computeHeaders();
}

return rows;
@Override
public List<Word> getWords() {

return getCells().stream()
.map(Cell::getTextBlocks)
.flatMap(Collection::stream)
.map(AbstractPageBlock::getWords)
.flatMap(Collection::stream)
.toList();
}


public int getRowCount() {

return getRows().size();
@ -85,259 +76,16 @@ public class TablePageBlock extends AbstractPageBlock {

return getRows().stream()
.mapToInt(List::size)
.max()
.orElse(0);
.max().orElse(0);

}


/**
* Detect header cells (either first row or first column):
* Column is marked as header if originalCell text is bold and row originalCell text is not bold.
* Defaults to row.
*/
private void computeHeaders() {

if (rows == null) {
rows = computeRows();
}
// A bold originalCell is a header originalCell as long as every originalCell to the left/top is bold, too
// we move from left to right and top to bottom
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
List<Cell> rowCells = rows.get(rowIndex);
if (rowCells.size() == 1) {
continue;
}

for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
Cell cell = rowCells.get(colIndex);
List<Cell> cellsToTheLeft = rowCells.subList(0, colIndex);
Cell lastHeaderCell = null;
for (Cell leftCell : cellsToTheLeft) {
if (leftCell.isHeaderCell()) {
lastHeaderCell = leftCell;
} else {
break;
}
}
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
List<Cell> cellsToTheTop = new ArrayList<>();
for (int i = 0; i < rowIndex; i++) {
try {
cellsToTheTop.add(rows.get(i)
.get(colIndex));
} catch (IndexOutOfBoundsException e) {
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
}
}
for (Cell topCell : cellsToTheTop) {
if (topCell.isHeaderCell()) {
lastHeaderCell = topCell;
} else {
break;
}
}
if (lastHeaderCell != null) {
cell.getHeaderCells().add(lastHeaderCell);
}
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks()
.get(0).getMostPopularWordStyle().equals("bold")) {
cell.setHeaderCell(true);
}
}
}

}


private List<List<Cell>> computeRows() {

List<List<Cell>> rows = new ArrayList<>();
if (rotation == 90) {
for (int i = 0; i < unrotatedColCount; i++) { // rows
List<Cell> lastRow = new ArrayList<>();
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
Cell cell = cellTreeMap.get(new CellPosition(j, i));
if (cell != null) {
lastRow.add(cell);
}
}
rows.add(lastRow);
}
} else if (rotation == 270) {
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < unrotatedRowCount; j++) { // cols
Cell cell = cellTreeMap.get(new CellPosition(j, i));
if (cell != null) {
lastRow.add(cell);
}
}
rows.add(lastRow);
}
} else {
for (int i = 0; i < unrotatedRowCount; i++) {
List<Cell> lastRow = new ArrayList<>();
for (int j = 0; j < unrotatedColCount; j++) {
Cell cell = cellTreeMap.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
if (cell != null) {
lastRow.add(cell);
}
}
rows.add(lastRow);
}
}

return rows;

}


private void addCells(List<Cell> cells) {

if (cells.isEmpty()) {
return;
}

cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);

List<List<Cell>> rowsOfCellsMatrix = calculateTableStructure(cells);

for (int i = 0; i < rowsOfCellsMatrix.size(); i++) {
for (int j = 0; j < rowsOfCellsMatrix.get(i).size(); j++) {
addCellToRowAndCol(rowsOfCellsMatrix.get(i)
.get(j), i, j);
}
}

}


/**
* Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
*
* @param cells The found cells
* @return TablePageBlock Structure as a rows of cells matrix
*/
private List<List<Cell>> calculateTableStructure(List<Cell> cells) {

if (cells.isEmpty()) {
return new ArrayList<>();
}

Set<Double> uniqueX = new HashSet<>();
Set<Double> uniqueY = new HashSet<>();
cells.stream()
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
.forEach(c -> {
uniqueX.add(c.getPdfMinX());
uniqueX.add(c.getPdfMaxX());
uniqueY.add(c.getPdfMinY());
uniqueY.add(c.getPdfMaxY());
});

var sortedUniqueX = uniqueX.stream()
.sorted()
.toList();
var sortedUniqueY = uniqueY.stream()
.sorted()
.toList();

List<List<Cell>> rowsOfCells = new ArrayList<>();

Double prevY = null;

for (Double y : sortedUniqueY) {

List<Cell> row = new ArrayList<>();

Double prevX = null;
for (Double x : sortedUniqueX) {

if (prevY != null && prevX != null) {
var cellFromGridStructure = new Cell(new Point2D.Double(prevX, prevY), new Point2D.Double(x, y));

if (cellFromGridStructure.hasMinimumSize()) {

cells.stream()
.map(originalCell -> new CellWithIntersection(originalCell,
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxPdf(),
originalCell.getBBoxPdf())))
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
.max(Comparator.comparing(CellWithIntersection::intersectedArea))
.map(CellWithIntersection::originalCell)
.ifPresent(matchingCell -> cellFromGridStructure.getTextBlocks().addAll(matchingCell.getTextBlocks()));

row.add(cellFromGridStructure);
}
}
prevX = x;
}

// exclude empty rows and rows where all text blocks are empty
if (prevY != null && prevX != null && !row.isEmpty() && !row.stream()
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {

rowsOfCells.add(row);
}
prevY = y;
}

Collections.reverse(rowsOfCells);

// now cells are removed which are part of a column without any text blocks
// this is done by first computing the inverse matrix which contains call columns of cells
// then the column indices that have to be removed are determined
List<List<Cell>> columnsOfCells = new ArrayList<>();
int maxRowLength = rowsOfCells.stream()
.map(List::size)
.max(java.util.Comparator.naturalOrder())
.orElse(0);
for (int i = 0; i < maxRowLength; i++) {
columnsOfCells.add(new ArrayList<>());
}

for (List<Cell> row : rowsOfCells) {
for (int j = 0; j < row.size(); j++) {
columnsOfCells.get(j).add(row.get(j));
}
}

List<Integer> columnIndicesToRemove = new ArrayList<>();
int columnIndex = 0;
for (List<Cell> col : columnsOfCells) {
if (col.stream()
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
columnIndicesToRemove.add(columnIndex);
}
columnIndex++;
}
columnIndicesToRemove.sort(Collections.reverseOrder());

// update all rows so that the values of the empty columns get removed
var rowsOfCellsBefore = new ArrayList<>(rowsOfCells);
rowsOfCells = new ArrayList<>();
for (List<Cell> row : rowsOfCellsBefore) {
var updatedRow = new ArrayList<>(row);
columnIndicesToRemove.forEach(idxToRemove -> updatedRow.remove(updatedRow.get(idxToRemove)));
rowsOfCells.add(updatedRow);
}

return rowsOfCells;
}


private void addCellToRowAndCol(Cell cell, int row, int col) {

unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
unrotatedColCount = Math.max(unrotatedColCount, col + 1);

CellPosition cp = new CellPosition(row, col);
cellTreeMap.put(cp, cell);
public List<Cell> getCells() {

return getRows().stream()
.flatMap(List::stream)
.collect(Collectors.toList());
}


@ -360,7 +108,7 @@ public class TablePageBlock extends AbstractPageBlock {
}
if (column != null && column.getTextBlocks() != null) {
boolean first = true;
for (TextPageBlock textBlock : column.getTextBlocks()) {
for (AbstractPageBlock textBlock : column.getTextBlocks()) {
if (!first) {
sb.append("\n");
}
@ -392,7 +140,7 @@ public class TablePageBlock extends AbstractPageBlock {
sb.append(i == 0 ? "\n<th>" : "\n<td>");
if (column != null && column.getTextBlocks() != null) {
boolean first = true;
for (TextPageBlock textBlock : column.getTextBlocks()) {
for (AbstractPageBlock textBlock : column.getTextBlocks()) {
if (!first) {
sb.append("<br />");
}
@ -411,9 +159,4 @@ public class TablePageBlock extends AbstractPageBlock {
return sb.toString();
}


record CellWithIntersection(Cell originalCell, double intersectedArea) {

}

}
@ -1,9 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
package com.knecon.fforesight.service.layoutparser.processor.model.text;

import java.util.List;

import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;

import lombok.AllArgsConstructor;
import lombok.Data;

@ -1,9 +1,7 @@
package com.knecon.fforesight.service.layoutparser.processor.model;
package com.knecon.fforesight.service.layoutparser.processor.model.text;

import java.util.List;

import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;

import lombok.AllArgsConstructor;
import lombok.Data;
@ -7,8 +7,7 @@ import org.apache.pdfbox.text.TextPosition;

import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;

import lombok.AccessLevel;
import lombok.AllArgsConstructor;
@ -65,7 +64,7 @@ public class RedTextPosition extends TextBoundingBox {
pos.setBBoxDirAdj(dirAdjPosition);

AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
Rectangle2D bBoxInitialUserSpace = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
Rectangle2D bBoxInitialUserSpace = RectangleTransformations.transform(dirAdjPosition, affineTransform);

pos.setBBoxPdf(bBoxInitialUserSpace); // These are definitely correct
@ -2,47 +2,62 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;

import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;

import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;

@EqualsAndHashCode(callSuper = true)
@Data
@AllArgsConstructor
@Builder
@NoArgsConstructor
public class TextPageBlock extends AbstractPageBlock {

@Builder.Default
@EqualsAndHashCode.Exclude
private List<Word> words = new ArrayList<>();
@Builder.Default
@EqualsAndHashCode.Exclude
private FrequencyCounters frequencyCounters = new FrequencyCounters();

private Rectangle2D bBoxDirAdj;

private boolean underlined;

private PageBlockType classification;

private boolean toDuplicate;

@EqualsAndHashCode.Exclude
private String text;
private boolean changed;


public TextPageBlock(List<Word> words, int page, PageBlockType classification, Set<LayoutEngine> engines, Orientation orientation) {

this.page = page;
this.classification = classification;
this.engines = engines;
this.orientation = orientation;
setDefaultFields(words);
}


public TextPageBlock(List<Word> words) {

setDefaultFields(words);
}


private void setDefaultFields(List<Word> words) {

this.words = new ArrayList<>(words);
this.frequencyCounters = new FrequencyCounters();

@ -73,10 +88,6 @@ public class TextPageBlock extends AbstractPageBlock {
this.bBoxDirAdj = new Rectangle2D.Double();
return;
}
this.bBoxDirAdj = words.stream()
.map(Word::getBBoxDirAdj)
.collect(RectangleTransformations.collectBBox());

setToBBoxOfComponents(words);
}

@ -87,7 +98,7 @@ public class TextPageBlock extends AbstractPageBlock {
}


public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
public static TextPageBlock merge(Collection<TextPageBlock> textBlocksToMerge) {

if (textBlocksToMerge.isEmpty()) {
throw new IllegalArgumentException("Need to provide at least one TextPageBlock.");
@ -98,14 +109,33 @@ public class TextPageBlock extends AbstractPageBlock {
.count() != 1) {
throw new IllegalArgumentException("Cannot merge textBlocks on different pages.");
}
if (textBlocksToMerge.stream()
.map(AbstractPageBlock::getClassification)
.distinct()
.count() != 1) {
throw new IllegalArgumentException("Cannot merge textBlocks of different types.");
}
if (textBlocksToMerge.stream()
.map(AbstractPageBlock::getDir)
.distinct()
.count() != 1) {
throw new IllegalArgumentException("Cannot merge textBlocks of different directions.");
}

List<Word> sequences = textBlocksToMerge.stream()
.map(TextPageBlock::getWords)
.flatMap(java.util.Collection::stream)
.toList();
sequences = new ArrayList<>(sequences);
.flatMap(Collection::stream)
.collect(Collectors.toList());

return new TextPageBlock(sequences);
TextPageBlock first = textBlocksToMerge.iterator().next();
return new TextPageBlock(sequences,
first.getPage(),
first.getClassification(),
textBlocksToMerge.stream()
.map(AbstractPageBlock::getEngines)
.flatMap(Collection::stream)
.collect(Collectors.toSet()),
Orientation.NONE);
}


@ -172,6 +202,14 @@ public class TextPageBlock extends AbstractPageBlock {
}


public void removeAll(List<Word> words) {

changed = true;
this.words.removeAll(words);
setDefaultFields(this.words);
}


public TextPageBlock copy() {

return new TextPageBlock(new ArrayList<>(words));
@ -15,6 +15,7 @@ import org.apache.pdfbox.text.TextPosition;

import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;

import lombok.AllArgsConstructor;
import lombok.Builder;
@ -66,9 +67,9 @@ public class Word extends TextBoundingBox implements CharSequence {
}


public Word(List<Character> textPositions, int page) {
public Word(List<Character> characters, int page) {

this.characters = new ArrayList<>(textPositions);
this.characters = new ArrayList<>(characters);
this.page = page;
calculateBBoxAndHashcode();
}
@ -101,12 +102,12 @@ public class Word extends TextBoundingBox implements CharSequence {
@Override
public Word subSequence(int start, int end) {

var textPositionSequence = new Word();
textPositionSequence.characters = characters.subList(start, end);
textPositionSequence.page = page;
textPositionSequence.dir = dir;
textPositionSequence.setToBBoxOfComponents(getTextPositions());
return textPositionSequence;
var word = new Word();
word.characters = characters.subList(start, end);
word.page = page;
word.dir = dir;
word.setToBBoxOfComponents(getTextPositions());
return word;
}


@ -262,7 +263,7 @@ public class Word extends TextBoundingBox implements CharSequence {
public void transform(AffineTransform rotateInstance) {

for (RedTextPosition textPosition : getTextPositions()) {
Rectangle2D exactDirAdjCoordinates = rotateInstance.createTransformedShape(textPosition.getBBoxDirAdj()).getBounds2D();
Rectangle2D exactDirAdjCoordinates = RectangleTransformations.transform(textPosition.getBBoxDirAdj(), rotateInstance);
textPosition.setBBoxDirAdj(exactDirAdjCoordinates);
}
calculateBBoxAndHashcode();
@@ -13,7 +13,10 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;

import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageMetadata;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
import com.knecon.fforesight.service.ocr.v1.api.model.Figure;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;

import lombok.RequiredArgsConstructor;

@@ -21,48 +24,78 @@ import lombok.RequiredArgsConstructor;

@RequiredArgsConstructor
public class ImageServiceResponseAdapter {

    public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse) {
    public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse, IdpResult idpResult) {

        Map<Integer, List<ClassifiedImage>> images = new HashMap<>();
        imageServiceResponse.getData().forEach(imageMetadata -> {
            var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification()
                    .getLabel()
                    .toUpperCase(Locale.ROOT)) : ImageType.OTHER;
            images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
                    .add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
                            imageMetadata.getPosition().getY1(),
                            imageMetadata.getGeometry().getWidth(),
                            imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber(), imageMetadata.getRepresentation()));
        });

        // Currently this is a copy, but it will be changed later because I don't think that we should keep unclassified images.
        imageServiceResponse.getDataCV().forEach(imageMetadata -> {
            var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification()
                    .getLabel()
                    .toUpperCase(Locale.ROOT)) : ImageType.OTHER;
            images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
                    .add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
                            imageMetadata.getPosition().getY1(),
                            imageMetadata.getGeometry().getWidth(),
                            imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber(), imageMetadata.getRepresentation()));
        });
        imageServiceResponse.getData()
                .forEach(imageMetadata -> addImageMetaData(imageMetadata, images));
        imageServiceResponse.getDataCV()
                .forEach(imageMetadata -> addImageMetaData(imageMetadata, images));
        idpResult.figures()
                .forEach(figure -> addFigure(figure, images));

        return images;
    }


    private static void addFigure(Figure figure, Map<Integer, List<ClassifiedImage>> images) {

        var classification = ImageType.GRAPHIC;
        ClassifiedImage image = new ClassifiedImage(figure.image().bbox().get().getBounds2D(), classification, false, figure.image().pageNumber(), "");
        getImagesOnPage(figure.image().pageNumber(), images).add(image);
    }


    private static void addImageMetaData(ImageMetadata imageMetadata, Map<Integer, List<ClassifiedImage>> images) {

        var image = new ClassifiedImage(getPosition(imageMetadata),
                getImageType(imageMetadata),
                imageMetadata.isAlpha(),
                imageMetadata.getPosition().getPageNumber(),
                imageMetadata.getRepresentation());
        getImagesOnPage(imageMetadata.getPosition().getPageNumber(), images).add(image);
    }


    private static Rectangle2D.Double getPosition(ImageMetadata imageMetadata) {

        return new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
                imageMetadata.getPosition().getY1(),
                imageMetadata.getGeometry().getWidth(),
                imageMetadata.getGeometry().getHeight());
    }


    private static ImageType getImageType(ImageMetadata imageMetadata) {

        if (imageMetadata.getFilters().isAllPassed()) {
            return ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT));
        } else {
            return ImageType.OTHER;
        }
    }


    private static List<ClassifiedImage> getImagesOnPage(int pageNumber, Map<Integer, List<ClassifiedImage>> images) {

        return images.computeIfAbsent(pageNumber, x -> new ArrayList<>());
    }


    public void findOcr(ClassificationPage classificationPage) {

        classificationPage.getImages().forEach(image -> {
            if (image.getImageType().equals(ImageType.OTHER)) {
                for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
                    if (image.getPosition().contains(textblock.getBBoxPdf())) {
                        image.setImageType(ImageType.OCR);
                        return;
        classificationPage.getImages()
                .forEach(image -> {
                    if (image.getImageType().equals(ImageType.OTHER)) {
                        for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
                            if (image.getPosition().contains(textblock.getBBoxPdf())) {
                                image.setImageType(ImageType.OCR);
                                return;
                            }
                        }
                    }
                }
            }
        });
                });
    }

}
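Everything the adapter above collects, image-service data, CV data, and IDP figures, ends up keyed by page through getImagesOnPage, which is just a computeIfAbsent over the page map. A self-contained sketch of that grouping pattern; the record below is a simplified stand-in for ClassifiedImage, not the real model:

    import java.util.ArrayList;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;

    public class PageGroupingDemo {

        // Simplified stand-in for the project's image model.
        record DemoImage(int pageNumber, String type) {}

        // Same idea as getImagesOnPage(...): lazily create the per-page list on first use.
        static List<DemoImage> imagesOnPage(int pageNumber, Map<Integer, List<DemoImage>> images) {
            return images.computeIfAbsent(pageNumber, x -> new ArrayList<>());
        }

        public static void main(String[] args) {
            Map<Integer, List<DemoImage>> imagesPerPage = new HashMap<>();
            List<DemoImage> input = List.of(
                    new DemoImage(1, "GRAPHIC"),
                    new DemoImage(2, "OTHER"),
                    new DemoImage(1, "OCR"));

            // Every producer (image service, CV data, IDP figures) adds into the same map.
            input.forEach(image -> imagesOnPage(image.pageNumber(), imagesPerPage).add(image));

            System.out.println(imagesPerPage); // {1=[...], 2=[...]}
        }
    }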
@@ -183,7 +183,7 @@ public class BodyTextFrameService {

    if (cell == null || cell.getTextBlocks() == null) {
        continue;
    }
    for (TextPageBlock textBlock : cell.getTextBlocks()) {
    for (AbstractPageBlock textBlock : cell.getTextBlocks()) {
        expandRectangle(textBlock, page, expansionsRectangle);
    }
}

@@ -198,7 +198,7 @@ public class BodyTextFrameService {
}


private void expandRectangle(TextPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) {
private void expandRectangle(AbstractPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) {

    if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
        if (textBlock.getPdfMinY() < expansionsRectangle.minX) {
@@ -1,25 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services;

import java.awt.geom.Rectangle2D;

import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;

import lombok.experimental.UtilityClass;

@UtilityClass
public class MainBodyTextFrameExtractionService {

    private static final double TEXT_FRAME_PAD_WIDTH = 0.0;
    private static final double TEXT_FRAME_PAD_HEIGHT = 0.02;


    public Rectangle2D calculateMainBodyTextFrame(LineInformation lineInformation) {

        Rectangle2D mainBodyTextFrame = lineInformation.getLineBBox().stream()
                .collect(RectangleTransformations.collectBBox());

        return RectangleTransformations.pad(mainBodyTextFrame, mainBodyTextFrame.getWidth() * TEXT_FRAME_PAD_WIDTH, mainBodyTextFrame.getHeight() * TEXT_FRAME_PAD_HEIGHT);
    }

}
@@ -1,73 +1,229 @@
package com.knecon.fforesight.service.layoutparser.processor.services;

import java.awt.geom.Rectangle2D;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.LinkedList;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.stream.Collectors;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.core.io.ClassPathResource;

import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.FindGraphicsRaster;
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicBBDetector;
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;

import lombok.experimental.UtilityClass;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.experimental.FieldDefaults;
import lombok.extern.slf4j.Slf4j;

@UtilityClass
@Slf4j
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class PageContentExtractor {

    public List<PageContents> getSortedPageContents(String filename) throws IOException {
    static boolean USE_IMAGE_BASED_GRAPHIC_DETECTION;
    @Getter
    int pageCount;
    @Getter
    File document;

        List<PageContents> textPositionSequencesPerPage = new LinkedList<>();
        ClassPathResource pdfResource = new ClassPathResource(filename);
    PageContents[] pageContents;
    CountDownLatch[] finishedLookup;
    List<List<Integer>> pageNumberBatches;

        try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile())) {

            for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
    public PageContentExtractor(File document, int threads) {

                PDFLinesTextStripper stripper = new PDFLinesTextStripper();
                PDPage pdPage = pdDocument.getPage(pageNumber - 1);
                stripper.setPageNumber(pageNumber);
                stripper.setSortByPosition(true);
                stripper.setStartPage(pageNumber);
                stripper.setEndPage(pageNumber);
                stripper.setPdpage(pdPage);
                stripper.getText(pdDocument);

                Map<Float, List<Word>> sortedTextPositionSequencesPerDir = stripper.getWords()
                        .stream()
                        .collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees()));

                var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation());

                textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
                        RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
                        RectangleTransformations.toRectangle2D(pdPage.getMediaBox()),
                        stripper.getRulings()));
            }
        this.document = document;
        this.pageCount = getPageCount(document);
        this.pageContents = new PageContents[pageCount];
        this.finishedLookup = new CountDownLatch[pageCount];
        for (int i = 0; i < pageCount; i++) {
            this.finishedLookup[i] = new CountDownLatch(1);
        }
        int actualThreads = Math.min(pageCount, threads);
        pageNumberBatches = new ArrayList<>(actualThreads);
        for (int i = 0; i < actualThreads; i++) {
            pageNumberBatches.add(new ArrayList<>(pageCount / actualThreads));
        }
        for (int i = 1; i <= pageCount; i++) {
            pageNumberBatches.get(i % actualThreads).add(i);
        }

        return textPositionSequencesPerPage;
    }


    public List<Word> sortByDirAccordingToPageRotation(Map<Float, List<Word>> sortedTextPositionSequencesPerDir, int rotation) {
    @SneakyThrows
    private int getPageCount(File document) {

        LinkedList<Float> sortedKeys = new LinkedList<>(sortedTextPositionSequencesPerDir.keySet().stream().sorted().toList());

        for (int i = 0; i < sortedKeys.size(); i++) {
            if (sortedKeys.get(i) < rotation) {
                Float keyToSwap = sortedKeys.remove(i);
                sortedKeys.addLast(keyToSwap);
            }
        try (var doc = openDocument(document)) {
            return doc.getNumberOfPages();
        }
        return sortedKeys.stream().map(sortedTextPositionSequencesPerDir::get).flatMap(Collection::stream).toList();
    }


    @SneakyThrows
    public void startAsync() {

        List<Thread> extractionThreads = new ArrayList<>(pageNumberBatches.size());
        for (List<Integer> pageNumberBatch : pageNumberBatches) {
            Thread thread = new Thread(() -> extractPages(pageNumberBatch));
            thread.start();
            extractionThreads.add(thread);
        }
        Thread finisher = new Thread(() -> {
            awaitFinished(extractionThreads);
        });
        finisher.start();
    }


    @SneakyThrows
    private static void awaitFinished(List<Thread> extractionThreads) {

        for (Thread extractionThread : extractionThreads) {
            extractionThread.join();
        }
        log.info("Page content extraction threads finished!");
    }


    @SneakyThrows
    private void extractPages(List<Integer> pageNumbers) {

        var doc = openDocument(document);
        int count = 0;
        var pageGetter = new PageGetter(doc.getPages()
                .iterator(), pageCount);
        for (Integer pageNumber : pageNumbers) {
            count++;
            if (count % 100 == 0) {
                // As PDFBox caches all kinds of data, we need to close and reopen the document every once in a while to save RAM.
                doc.close();
                doc = openDocument(document);
            }

            extractPage(pageNumber, doc, pageGetter.getPage(pageNumber));
        }
        doc.close();
    }


    @SneakyThrows
    private PDDocument openDocument(File originFile) {

        PDDocument document = Loader.loadPDF(originFile);
        document.setAllSecurityToBeRemoved(true);
        return document;
    }


    @SneakyThrows
    public void extractPage(Integer pageNumber, PDDocument doc, PDPage pdPage) {

        PDFLinesTextStripper stripper = new PDFLinesTextStripper();
        stripper.setPageNumber(pageNumber);
        stripper.setStartPage(pageNumber);
        stripper.setEndPage(pageNumber);
        stripper.setPdpage(pdPage);
        stripper.getText(doc);

        PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
        List<Word> words = stripper.getWords();
        List<Ruling> rulings = stripper.getRulings();
        List<Box> graphicBBoxes = findGraphicBBoxes(pageInformation, pdPage, doc, words);

        pageContents[pageNumber - 1] = new PageContents(pageInformation, words, rulings, graphicBBoxes);
        finishedLookup[pageNumber - 1].countDown();
    }


    private static List<Box> findGraphicBBoxes(PageInformation pageInformation, PDPage pdPage, PDDocument doc, List<Word> words) throws IOException {

        GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true);
        List<Box> graphicBBoxes = graphicBBDetector.findGraphicBB();

        if (USE_IMAGE_BASED_GRAPHIC_DETECTION) {
            // This should only be used if OCR was performed; it is currently at an early stage and needs to be improved.
            List<Rectangle2D> wordIgnoreZones = words.stream()
                    .map(BoundingBox::getBBoxPdf)
                    .map(box -> RectangleTransformations.pad(box, 2, 2))
                    .collect(Collectors.toList());
            graphicBBoxes.addAll(FindGraphicsRaster.findCCBoundingBoxes(doc, wordIgnoreZones, pageInformation));
        }
        return graphicBBoxes;
    }


    public PageContents awaitPageContents(Integer pageNumber) throws InterruptedException, TimeoutException {

        if (finishedLookup[pageNumber - 1].await(1, TimeUnit.MINUTES)) {
            return pageContents[pageNumber - 1];
        }
        throw new TimeoutException("A timeout has occurred during page content extraction!");
    }


    public List<PageContents> awaitAllContents() throws InterruptedException {

        for (CountDownLatch countDownLatch : finishedLookup) {
            countDownLatch.await();
        }
        return Arrays.asList(pageContents);
    }


    @SneakyThrows
    public static List<PageContents> getDocumentContents(File document, int threads) {

        PageContentExtractor extractor = new PageContentExtractor(document, threads);
        extractor.startAsync();
        return extractor.awaitAllContents();
    }


    private static class PageGetter {

        Iterator<PDPage> pageIterator;
        int current;
        int max;


        PageGetter(Iterator<PDPage> pageIterator, int max) {

            this.pageIterator = pageIterator;
            this.max = max;
            this.current = 0;
        }


        public PDPage getPage(int pageNumber) {

            assert pageNumber >= current && pageNumber <= max;
            int pagesToIterate = pageNumber - current;
            PDPage page = null;
            for (int i = 0; i < pagesToIterate; i++) {
                page = pageIterator.next();
            }
            current = pageNumber;
            return page;
        }

    }

}
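getDocumentContents above already shows the intended call pattern: construct the extractor, startAsync, then awaitAllContents. The per-page hand-off is one CountDownLatch per page, counted down once that page's PageContents lands in the result array. A stripped-down, self-contained sketch of that latch-per-page pattern (no PDFBox, no project types, all names illustrative):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.CountDownLatch;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.TimeoutException;

    public class LatchPerPageDemo {

        private final String[] results;
        private final CountDownLatch[] finished;

        LatchPerPageDemo(int pageCount) {
            results = new String[pageCount];
            finished = new CountDownLatch[pageCount];
            for (int i = 0; i < pageCount; i++) {
                finished[i] = new CountDownLatch(1);
            }
        }

        // Producer side: store the result, then release waiters for exactly that page.
        void complete(int pageNumber, String content) {
            results[pageNumber - 1] = content;
            finished[pageNumber - 1].countDown();
        }

        // Consumer side: block (with a timeout) until that page is ready.
        String await(int pageNumber) throws InterruptedException, TimeoutException {
            if (finished[pageNumber - 1].await(1, TimeUnit.MINUTES)) {
                return results[pageNumber - 1];
            }
            throw new TimeoutException("page " + pageNumber + " not extracted in time");
        }

        public static void main(String[] args) throws Exception {
            LatchPerPageDemo demo = new LatchPerPageDemo(3);
            List<Thread> workers = new ArrayList<>();
            for (int page = 1; page <= 3; page++) {
                int p = page;
                Thread worker = new Thread(() -> demo.complete(p, "contents of page " + p));
                worker.start();
                workers.add(worker);
            }
            for (int page = 1; page <= 3; page++) {
                System.out.println(demo.await(page)); // consumers may arrive before or after the producer
            }
            for (Thread worker : workers) {
                worker.join();
            }
        }
    }

One detail of the real class worth noting: extractPages reopens the PDDocument every 100 pages because, as the comment says, PDFBox caches per-document data, so very long documents would otherwise accumulate memory.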
@@ -1,24 +0,0 @@
package com.knecon.fforesight.service.layoutparser.processor.services;

import java.awt.geom.Rectangle2D;

import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;

import lombok.experimental.UtilityClass;

@UtilityClass
public class PageInformationService {

    public PageInformation build(PageContents pageContents) {

        LineInformation lineInformation = LineDetectionService.calculateLineInformation(pageContents.getSortedWords());
        Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(lineInformation);
        GapInformation gapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedWords(), mainBodyTextFrame);

        return new PageInformation(pageContents, lineInformation, mainBodyTextFrame, gapInformation);
    }

}
@@ -5,18 +5,20 @@ import static com.knecon.fforesight.service.layoutparser.processor.utils.Geometr
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

import org.springframework.stereotype.Service;

import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.processor.utils.UnionFind;

import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;

@@ -52,22 +54,22 @@ public class RulingCleaningService {

private Rulings cleanRulings(Rulings rulings) {

    List<List<Rectangle2D>> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
            .map(RulingCleaningService::getOverlapRectangle)
            .distinct()
            .toList());
    var groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
            .map(RulingCleaningService::getOverlapRectangle)
            .distinct()
            .toList());
    List<Ruling> cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream()
            .map(rectList -> getXCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList)))
            .map(RulingCleaningService::getXCenteredRuling)
            .filter(ruling -> ruling.length() > 0)
            .toList();

    List<List<Rectangle2D>> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
            .map(RulingCleaningService::getOverlapRectangle)
            .distinct()
            .toList());
    var groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
            .map(RulingCleaningService::getOverlapRectangle)
            .distinct()
            .toList());

    List<Ruling> cleanedHorizontalRulings = groupedOverlappingHorizontalRectangles.stream()
            .map(rectList -> getYCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList)))
            .map(RulingCleaningService::getYCenteredRuling)
            .filter(ruling -> ruling.length() > 0)
            .collect(Collectors.toList());

@@ -75,13 +77,40 @@ public class RulingCleaningService {
}


private List<List<Rectangle2D>> groupOverlappingRectangles(List<Rectangle2D> rectangles) {
private static Ruling getXCenteredRuling(Set<OverlapRectangle> rectList) {

    UnionFind<Rectangle2D> unionFind = new UnionFind<>();
    Ruling ruling = getXCenteredRuling(rectList.stream()
            .map(OverlapRectangle::rectangle2D)
            .collect(RectangleTransformations.collectBBox()));
    ruling.setStyle(rectList.iterator().next().style);
    return ruling;
}


private static Ruling getYCenteredRuling(Set<OverlapRectangle> rectList) {

    Ruling ruling = getYCenteredRuling(rectList.stream()
            .map(OverlapRectangle::rectangle2D)
            .collect(RectangleTransformations.collectBBox()));
    ruling.setStyle(rectList.iterator().next().style);
    return ruling;
}


private Collection<Set<OverlapRectangle>> groupOverlappingRectangles(List<OverlapRectangle> rectangles) {

    UnionFind<OverlapRectangle> unionFind = new UnionFind<>(new HashSet<>(rectangles));
    for (int i = 0; i < rectangles.size(); i++) {
        for (int j = i + 1; j < rectangles.size(); j++) {
            Rectangle2D rectangle1 = rectangles.get(i);
            Rectangle2D rectangle2 = rectangles.get(j);

            OverlapRectangle overlapRectangle1 = rectangles.get(i);
            OverlapRectangle overlapRectangle2 = rectangles.get(j);

            if (!Objects.equals(overlapRectangle1.style, overlapRectangle2.style)) {
                continue;
            }
            Rectangle2D rectangle1 = overlapRectangle1.rectangle2D;
            Rectangle2D rectangle2 = overlapRectangle2.rectangle2D;

            // we can stop early when we are too far off because of x-y-sorting
            if (rectangle1.getMaxX() < rectangle2.getMinX() && rectangle1.getMaxY() < rectangle2.getMinY()) {
@@ -89,21 +118,16 @@ public class RulingCleaningService {
            }

            if (rectangle1.intersects(rectangle2)) {
                unionFind.union(rectangle1, rectangle2);
                unionFind.union(overlapRectangle1, overlapRectangle2);
            }
        }
    }

    Map<Rectangle2D, List<Rectangle2D>> groups = new HashMap<>();
    for (Rectangle2D rectangle : rectangles) {
        Rectangle2D root = unionFind.find(rectangle);
        groups.computeIfAbsent(root, k -> new ArrayList<>()).add(rectangle);
    }
    return new ArrayList<>(groups.values());
    return unionFind.getGroups();
}


private static Rectangle2D getOverlapRectangle(Ruling ruling) {
private static OverlapRectangle getOverlapRectangle(Ruling ruling) {

    float y;
    float x;
@@ -124,12 +148,14 @@ public class RulingCleaningService {
        y = ruling.y2;
        h = ruling.y1 - ruling.y2;
    }

    Rectangle2D overlapRectangle;
    if (ruling.isHorizontal()) {
        return new Rectangle2D.Double(x - THRESHOLD_X_HORIZONTAL, y - THRESHOLD_Y_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
        overlapRectangle = new Rectangle2D.Double(x - THRESHOLD_X_HORIZONTAL, y - THRESHOLD_Y_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
    } else {
        return new Rectangle2D.Double(x - THRESHOLD_X_VERTICAL, y - THRESHOLD_Y_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
        overlapRectangle = new Rectangle2D.Double(x - THRESHOLD_X_VERTICAL, y - THRESHOLD_Y_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
    }

    return new OverlapRectangle(overlapRectangle, ruling.getStyle());
}


@@ -243,4 +269,8 @@ public class RulingCleaningService {

}

private record OverlapRectangle(Rectangle2D rectangle2D, Ruling.Style style) {

}

}
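The rewritten groupOverlappingRectangles above hands the bookkeeping the removed HashMap loop used to do (find the root, collect members per root) to UnionFind.getGroups(). The project's UnionFind is not part of this diff, so the following is only a plausible minimal version that supports the calls made here: a constructor seeded with all elements, union, find, and getGroups.

    import java.util.Collection;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;

    // Minimal sketch; the project's real UnionFind may use ranks, full path compression, etc.
    public class UnionFindSketch<T> {

        private final Map<T, T> parent = new HashMap<>();

        public UnionFindSketch(Set<T> elements) {
            elements.forEach(e -> parent.put(e, e)); // every element starts as its own root
        }

        public T find(T element) {
            T root = element;
            while (!parent.get(root).equals(root)) {
                root = parent.get(root);
            }
            parent.put(element, root); // shortcut the queried element to its root
            return root;
        }

        public void union(T a, T b) {
            parent.put(find(a), find(b)); // attach one root under the other
        }

        // One set per connected component, i.e. exactly what the removed HashMap loop produced.
        public Collection<Set<T>> getGroups() {
            Map<T, Set<T>> groups = new HashMap<>();
            for (T element : parent.keySet()) {
                groups.computeIfAbsent(find(element), k -> new HashSet<>()).add(element);
            }
            return groups.values();
        }
    }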
@@ -12,8 +12,8 @@ import org.springframework.stereotype.Service;

import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationFooter;
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationHeader;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection;
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
@ -30,7 +30,6 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Deprecated
|
||||
public class SectionsBuilderService {
|
||||
|
||||
|
||||
public void buildSections(ClassificationDocument document) {
|
||||
|
||||
List<AbstractPageBlock> chunkWords = new ArrayList<>();
|
||||
@ -73,8 +72,7 @@ public class SectionsBuilderService {
|
||||
chunkBlockList.add(chunkBlock);
|
||||
chunkWords = new ArrayList<>();
|
||||
if (!chunkBlock.getTables().isEmpty()) {
|
||||
previousTable = chunkBlock.getTables()
|
||||
.get(chunkBlock.getTables().size() - 1);
|
||||
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
|
||||
}
|
||||
}
|
||||
if (current instanceof TablePageBlock table) {
|
||||
@ -236,12 +234,8 @@ public class SectionsBuilderService {
|
||||
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
|
||||
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
|
||||
// Allow merging of tables if header row is separated from first logical non-header row
|
||||
if (previousTableNonHeaderRow.isEmpty()
|
||||
&& previousTable.getRowCount() == 1
|
||||
&& previousTable.getRows()
|
||||
.get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows()
|
||||
.get(0)
|
||||
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows().get(0)
|
||||
.stream()
|
||||
.map(cell -> {
|
||||
Cell fakeCell = Cell.copy(cell);
|
||||
@ -252,8 +246,7 @@ public class SectionsBuilderService {
|
||||
}
|
||||
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
|
||||
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = currentTable.getRows()
|
||||
.get(i);
|
||||
List<Cell> row = currentTable.getRows().get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream()
|
||||
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
@ -272,13 +265,6 @@ public class SectionsBuilderService {
|
||||
|
||||
for (AbstractPageBlock container : wordBlockList) {
|
||||
if (container instanceof TablePageBlock table) {
|
||||
|
||||
if (lastHeadline == null || lastHeadline.isEmpty()) {
|
||||
table.setHeadline("Text in table");
|
||||
} else {
|
||||
table.setHeadline("TablePageBlock in: " + lastHeadline);
|
||||
}
|
||||
|
||||
section.getPageBlocks().add(table);
|
||||
continue;
|
||||
}
|
||||
@ -310,8 +296,7 @@ public class SectionsBuilderService {
|
||||
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
|
||||
|
||||
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = table.getRows()
|
||||
.get(i);
|
||||
List<Cell> row = table.getRows().get(i);
|
||||
if (row.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -1,159 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Service
|
||||
public class TableExtractionService {
|
||||
|
||||
private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
|
||||
private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7;
|
||||
|
||||
|
||||
/**
|
||||
* Finds tables on a page and moves textblocks into cells of the found tables.
|
||||
* Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
* <p>
|
||||
* DirAdj (Text direction adjusted) values can not be used here.
|
||||
*
|
||||
* @param emptyCells The cells used to build the table.
|
||||
* @param page Page object that contains textblocks and statistics.
|
||||
*/
|
||||
|
||||
public void extractTables(List<Cell> emptyCells, ClassificationPage page) {
|
||||
|
||||
// sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them
|
||||
emptyCells.sort(CELL_SIZE_COMPARATOR);
|
||||
|
||||
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
|
||||
for (Cell cell : emptyCells) {
|
||||
if (cell.hasMinimumSize() && doesCellContainTextBlock(cell, textBlock)) {
|
||||
cell.addTextBlock(textBlock);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
List<Cell> cells = new ArrayList<>(new HashSet<>(emptyCells));
|
||||
DoubleComparisons.sort(cells, BoundingBox.ILL_DEFINED_ORDER);
|
||||
|
||||
List<Rectangle2D> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
|
||||
// sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
|
||||
// this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells
|
||||
spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);
|
||||
|
||||
List<TablePageBlock> tables = new ArrayList<>();
|
||||
for (Rectangle2D area : spreadsheetAreas) {
|
||||
|
||||
List<Cell> containedCells = new ArrayList<>();
|
||||
for (Cell c : cells) {
|
||||
if (c.hasMinimumSize() && area.contains(c.getBBoxPdf())) {
|
||||
containedCells.add(c);
|
||||
}
|
||||
}
|
||||
|
||||
var containedCellsWithText = containedCells.stream()
|
||||
.filter(cell -> !cell.getTextBlocks().isEmpty())
|
||||
.toList();
|
||||
|
||||
// verify if table would contain fewer cells with text than the threshold allows
|
||||
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
|
||||
tables.add(new TablePageBlock(containedCells, page.getRotation()));
|
||||
cells.removeAll(containedCells);
|
||||
}
|
||||
}
|
||||
|
||||
for (TablePageBlock table : tables) {
|
||||
int position = -1;
|
||||
|
||||
for (AbstractPageBlock pageBlock : page.getTextBlocks()) {
|
||||
if (pageBlock instanceof TextPageBlock ? table.contains(pageBlock) : table.contains(pageBlock) && position == -1) {
|
||||
position = page.getTextBlocks().indexOf(pageBlock);
|
||||
}
|
||||
}
|
||||
if (position != -1) {
|
||||
page.getTextBlocks().add(position, table);
|
||||
|
||||
var toBeRemoved = table.getCells()
|
||||
.stream()
|
||||
.map(Cell::getTextBlocks)
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
// remove text blocks from the page that were also added with the table (from its contained cells)
|
||||
page.getTextBlocks().removeAll(toBeRemoved);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean checkIfTableCellsAreUniform(List<Cell> containedCells) {
|
||||
|
||||
if (containedCells.size() <= 2) {
|
||||
return true;
|
||||
}
|
||||
|
||||
Map<Long, List<Long>> cellsGroupedByRoundedWidth = containedCells.stream()
|
||||
.map(BoundingBox::getWidth)
|
||||
.map(size -> Math.round(size / 10.0) * 10)
|
||||
.collect(Collectors.groupingBy(Long::longValue));
|
||||
|
||||
return (double) cellsGroupedByRoundedWidth.size() / containedCells.size() <= TABLE_UNIFORMITY_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
|
||||
|
||||
return cell.contains(textBlock, RedTextPosition.HEIGHT_PADDING);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, PageInformation pageInformation) {
|
||||
|
||||
AffineTransform affineTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToImageCoords(pageInformation, 1);
|
||||
/*
|
||||
switch (pageInformation.rotationDegrees()) {
|
||||
case 90 -> affineTransform.translate(RedTextPosition.HEIGHT_PADDING, 0); //although this is wrong, our text coordinates are wrong as well
|
||||
case 180 -> affineTransform.translate(0, RedTextPosition.HEIGHT_PADDING);
|
||||
case 270 -> affineTransform.translate(-RedTextPosition.HEIGHT_PADDING, 0);
|
||||
default -> affineTransform.translate(0, -RedTextPosition.HEIGHT_PADDING);
|
||||
}
|
||||
*/
|
||||
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
|
||||
.stream()
|
||||
.map(rect -> new Cell(rect, affineTransform))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
@@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentif
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;

import lombok.Data;
@@ -0,0 +1,42 @@
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;

import java.util.Collections;
import java.util.List;

import org.springframework.stereotype.Service;

import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;

import lombok.AccessLevel;
import lombok.RequiredArgsConstructor;
import lombok.experimental.FieldDefaults;

@Service
@RequiredArgsConstructor
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
public class BlockificationService {

    RedactManagerBlockificationService redactManagerBlockificationService;
    DocstrumBlockificationService docstrumBlockificationService;
    DocuMineBlockificationService docuMineBlockificationService;


    public List<TextPageBlock> blockify(LayoutParsingType layoutParsingType, List<Word> words, CleanRulings cleanRulings, LayoutDebugLayer layoutDebugLayer) {

        if (words.isEmpty()) {
            return Collections.emptyList();
        }
        return switch (layoutParsingType) {
            case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(words, cleanRulings, layoutDebugLayer);
            case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
            case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
                    docstrumBlockificationService.blockify(words, cleanRulings, true, layoutDebugLayer, layoutParsingType);
            case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words, cleanRulings, false, layoutDebugLayer, layoutParsingType);
        };
    }

}
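The new BlockificationService above centralises dispatch behind one entry point: callers pass the LayoutParsingType and get a flat List<TextPageBlock> back, with an exhaustive Java switch expression doing the routing. A self-contained toy model of that dispatch style, using an illustrative enum with a subset of the constants seen above (the real LayoutParsingType lives in the internal API):

    public class DispatchDemo {

        // Illustrative subset of the parsing types handled by BlockificationService.
        enum ParsingType { REDACT_MANAGER_OLD, DOCUMINE_OLD, DOCUMINE, REDACT_MANAGER, CLARIFYND }

        // Switch expression with multi-label cases; the compiler checks exhaustiveness.
        static String route(ParsingType type) {
            return switch (type) {
                case REDACT_MANAGER_OLD -> "RedactManagerBlockificationService";
                case DOCUMINE_OLD -> "DocuMineBlockificationService";
                case DOCUMINE, REDACT_MANAGER -> "DocstrumBlockificationService (xyOrder = true)";
                case CLARIFYND -> "DocstrumBlockificationService (xyOrder = false)";
            };
        }

        public static void main(String[] args) {
            for (ParsingType type : ParsingType.values()) {
                System.out.println(type + " -> " + route(type));
            }
        }
    }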
@ -10,7 +10,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
@ -30,46 +29,39 @@ public class DocstrumBlockificationService {
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
public ClassificationPage blockify(List<Word> textPositions,
|
||||
CleanRulings rulings,
|
||||
boolean xyOrder,
|
||||
LayoutDebugLayer visualizations,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
public List<TextPageBlock> blockify(List<Word> words, CleanRulings rulings, boolean xyOrder, LayoutDebugLayer visualizations, LayoutParsingType layoutParsingType) {
|
||||
|
||||
CleanRulings usedRulings = rulings.withoutTextRulings();
|
||||
CleanRulings rulingsWithoutTextRulings = rulings.withoutTextRulings();
|
||||
|
||||
List<Zone> zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);
|
||||
List<Zone> zones = docstrumSegmentationService.segmentPage(words, xyOrder, rulingsWithoutTextRulings);
|
||||
|
||||
if (!textPositions.isEmpty()) {
|
||||
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
|
||||
visualizations.addLineVisualizationsFromZones(zones, textPositions.get(0).getPage());
|
||||
visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage());
|
||||
if (!words.isEmpty() && visualizations != null) {
|
||||
visualizations.addZoneVisualizations(zones, words.get(0).getPage());
|
||||
visualizations.addLineVisualizationsFromZones(zones, words.get(0).getPage());
|
||||
visualizations.addCharactersWithNeighbours(zones, words.get(0).getPage());
|
||||
}
|
||||
|
||||
var pageBlocks = toAbstractPageBlocks(zones);
|
||||
|
||||
var classificationPage = new ClassificationPage(pageBlocks);
|
||||
classificationPage.setCleanRulings(rulings);
|
||||
|
||||
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
|
||||
mergeIntersectingBlocks(pageBlocks, rulingsWithoutTextRulings, 0, 0);
|
||||
|
||||
if (layoutParsingType == LayoutParsingType.DOCUMINE
|
||||
|| layoutParsingType == LayoutParsingType.REDACT_MANAGER
|
||||
|| layoutParsingType == LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH) {
|
||||
combineBlocks(classificationPage, layoutParsingType);
|
||||
combineBlocks(pageBlocks, rulings, layoutParsingType);
|
||||
}
|
||||
|
||||
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
||||
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
|
||||
mergeIntersectingBlocks(pageBlocks, rulingsWithoutTextRulings, 0, 0);
|
||||
}
|
||||
|
||||
return classificationPage;
|
||||
return pageBlocks;
|
||||
}
|
||||
|
||||
|
||||
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones) {
|
||||
private List<TextPageBlock> toAbstractPageBlocks(List<Zone> zones) {
|
||||
|
||||
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
|
||||
List<TextPageBlock> abstractPageBlocks = new ArrayList<>();
|
||||
zones.forEach(zone -> {
|
||||
|
||||
List<Word> words = new ArrayList<>();
|
||||
@ -88,29 +80,23 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
public void combineBlocks(ClassificationPage page, LayoutParsingType layoutParsingType) {
|
||||
public void combineBlocks(List<TextPageBlock> blocks, CleanRulings rulingsWithoutTextRulings, LayoutParsingType layoutParsingType) {
|
||||
|
||||
TextPageBlock previous = new TextPageBlock();
|
||||
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
||||
CleanRulings usedRulings = page.getCleanRulings().withoutTextRulings();
|
||||
ListIterator<TextPageBlock> itty = blocks.listIterator();
|
||||
while (itty.hasNext()) {
|
||||
|
||||
AbstractPageBlock block = itty.next();
|
||||
if (block instanceof TablePageBlock) {
|
||||
previous = new TextPageBlock();
|
||||
continue;
|
||||
}
|
||||
TextPageBlock current = (TextPageBlock) block;
|
||||
TextPageBlock current = itty.next();
|
||||
|
||||
if (previous != null && !previous.getWords().isEmpty()) {
|
||||
|
||||
if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) {
|
||||
if (current.getDir() != previous.getDir() || rulingsWithoutTextRulings.lineBetween(current, previous)) {
|
||||
previous = current;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.isHeadline() || previous.isHeadline()) {
|
||||
if (intersectsYWithPreviousHavingMaxOneLine(previous, current, page)) {
|
||||
if (intersectsYWithPreviousHavingMaxOneLine(previous, current)) {
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, false);
|
||||
} else {
|
||||
previous = current;
|
||||
@ -119,7 +105,7 @@ public class DocstrumBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
||||
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, blocks)) {
|
||||
// previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, layoutParsingType != LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
|
||||
continue;
|
||||
@ -130,12 +116,12 @@ public class DocstrumBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, page)) {
|
||||
if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, blocks)) {
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) {
|
||||
if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, blocks)) {
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||
continue;
|
||||
}
|
||||
@ -144,43 +130,43 @@ public class DocstrumBlockificationService {
|
||||
previous = current;
|
||||
}
|
||||
|
||||
mergeIntersectingBlocks(page, usedRulings, 0, Y_THRESHOLD);
|
||||
mergeIntersectingBlocks(blocks, rulingsWithoutTextRulings, 0, Y_THRESHOLD);
|
||||
}
|
||||
|
||||
|
||||
private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||
private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, List<? extends AbstractPageBlock> allBlocks) {
|
||||
|
||||
return current.intersectsY(previous) //
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
|
||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0;
|
||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, allBlocks) <= 0;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(TextPageBlock previous,
|
||||
TextPageBlock current,
|
||||
ClassificationPage page) {
|
||||
List<? extends AbstractPageBlock> allBlocks) {
|
||||
|
||||
return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
|
||||
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
|
||||
&& !hasBetween(current, previous, allBlocks) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, allBlocks) <= 4;
|
||||
}
|
||||
|
||||
|
||||
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current) {
|
||||
|
||||
return previous.intersectsY(current) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1);
|
||||
}
|
||||
|
||||
|
||||
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, List<TextPageBlock> allBlocks) {
|
||||
|
||||
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
|
||||
&& previous.intersectsY(current) //
|
||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0;
|
||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, allBlocks) == 0;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator<AbstractPageBlock> itty, boolean toDuplicate) {
|
||||
private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator<TextPageBlock> itty, boolean toDuplicate) {
|
||||
|
||||
previous.addAll(current.getWords());
|
||||
previous = buildTextBlock(previous.getWords(), 0);
|
||||
@ -196,7 +182,7 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private boolean hasBetween(TextPageBlock block, TextPageBlock other, List<AbstractPageBlock> allBlocks) {
|
||||
private boolean hasBetween(TextPageBlock block, TextPageBlock other, List<? extends AbstractPageBlock> allBlocks) {
|
||||
|
||||
for (AbstractPageBlock current : allBlocks) {
|
||||
|
||||
@ -213,7 +199,7 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private int numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(TextPageBlock block, TextPageBlock other, List<AbstractPageBlock> allBlocks) {
|
||||
private int numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(TextPageBlock block, TextPageBlock other, List<? extends AbstractPageBlock> allBlocks) {
|
||||
|
||||
double minY = Math.min(block.getMinY(), other.getMinY());
|
||||
double maxY = Math.min(block.getMaxY(), other.getMaxY());
|
||||
@ -234,25 +220,18 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
public void mergeIntersectingBlocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) {
|
||||
public void mergeIntersectingBlocks(List<TextPageBlock> blocks, CleanRulings usedRulings, float xThreshold, float yThreshold) {
|
||||
|
||||
var blocks = page.getTextBlocks();
|
||||
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
||||
ListIterator<TextPageBlock> itty = blocks.listIterator();
|
||||
while (itty.hasNext()) {
|
||||
AbstractPageBlock block = itty.next();
|
||||
if (block == null) {
|
||||
continue;
|
||||
}
|
||||
if (block instanceof TablePageBlock) {
|
||||
TextPageBlock current = itty.next();
|
||||
if (current == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (block.getClassification() != null && block.getClassification().isHeadline()) {
|
||||
if (current.getClassification() != null && current.getClassification().isHeadline()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
TextPageBlock current = (TextPageBlock) block;
|
||||
|
||||
for (int i = 0; i < blocks.size(); i++) {
|
||||
|
||||
AbstractPageBlock abstractPageBlock = blocks.get(i);
|
||||
|
||||
@ -33,14 +33,14 @@ public class DocuMineBlockificationService {
|
||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The textPositions of a page.
|
||||
* @param words The words of a page.
|
||||
* @param cleanRulings All rulings on a page
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<Word> textPositions, CleanRulings cleanRulings) {
|
||||
public List<TextPageBlock> blockify(List<Word> words, CleanRulings cleanRulings) {
|
||||
|
||||
List<Word> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> textPageBlocks = new ArrayList<>();
|
||||
List<TextPageBlock> textPageBlocks = new ArrayList<>();
|
||||
|
||||
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
|
||||
|
||||
@ -52,7 +52,7 @@ public class DocuMineBlockificationService {
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Double splitX1 = null;
|
||||
for (Word word : textPositions) {
|
||||
for (Word word : words) {
|
||||
|
||||
boolean lineSeparation = prev != null && word.getYDirAdj() - prev.getMaxYDirAdj() > Math.min(word.getHeight(), prev.getHeight()) * 1.1;
|
||||
boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight();
|
||||
@ -120,7 +120,7 @@ public class DocuMineBlockificationService {
|
||||
|
||||
textPageBlocks.add(new TextPageBlock(chunkWords));
|
||||
|
||||
return new ClassificationPage(textPageBlocks);
|
||||
return textPageBlocks;
|
||||
}
|
||||
|
||||
|
||||
@ -171,8 +171,9 @@ public class DocuMineBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold) && (current.getClassification() == null || current.getClassification()
|
||||
.equals(inner.getClassification()))) {
|
||||
if (current.getDir() == inner.getDir() &&//
|
||||
current.intersects(inner, yThreshold, xThreshold) &&//
|
||||
(current.getClassification() == null || current.getClassification().equals(inner.getClassification()))) {
|
||||
|
||||
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||
current.addAll(inner.getWords());
|
||||
|
||||
@ -26,24 +26,24 @@ public class RedactManagerBlockificationService {
|
||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The words of a page.
|
||||
* @param words The words of a page.
|
||||
* @param visualizations
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<Word> textPositions, CleanRulings cleanRulings, LayoutDebugLayer visualizations) {
|
||||
public List<TextPageBlock> blockify(List<Word> words, CleanRulings cleanRulings, LayoutDebugLayer visualizations) {
|
||||
|
||||
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
|
||||
|
||||
int indexOnPage = 0;
|
||||
List<Word> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
||||
List<TextPageBlock> chunkBlockList = new ArrayList<>();
|
||||
|
||||
double minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
Word prev = null;
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Double splitX1 = null;
|
||||
for (Word word : textPositions) {
|
||||
for (Word word : words) {
|
||||
|
||||
boolean lineSeparation = word.getYDirAdj() - maxY > word.getHeight() * 1.25;
|
||||
boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight();
|
||||
@ -111,7 +111,7 @@ public class RedactManagerBlockificationService {
|
||||
chunkBlockList.add(cb1);
|
||||
}
|
||||
|
||||
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
|
||||
Iterator<TextPageBlock> itty = chunkBlockList.iterator();
|
||||
|
||||
TextPageBlock previousLeft = null;
|
||||
TextPageBlock previousRight = null;
|
||||
@ -159,12 +159,12 @@ public class RedactManagerBlockificationService {
|
||||
|
||||
previous = block;
|
||||
}
|
||||
if (!textPositions.isEmpty()) {
|
||||
if (!words.isEmpty() && visualizations != null) {
|
||||
visualizations.addTextBlockVisualizations(chunkBlockList.stream()
|
||||
.toList(), textPositions.get(0).getPage());
|
||||
.toList(), words.get(0).getPage());
|
||||
}
|
||||
|
||||
return new ClassificationPage(chunkBlockList);
|
||||
return chunkBlockList;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ import java.util.stream.Collectors;

import org.springframework.stereotype.Service;

import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
@ -5,7 +5,6 @@ import static java.util.stream.Collectors.groupingBy;
|
||||
import static java.util.stream.Collectors.toList;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
@ -15,7 +14,6 @@ import java.util.NoSuchElementException;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.AbstractSemanticNode;
|
||||
@ -36,8 +34,8 @@ import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBl
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
@ -112,9 +110,7 @@ public class DocumentGraphFactory {
|
||||
|
||||
public void addParagraphOrHeadline(GenericSemanticNode parentNode,
|
||||
TextPageBlock originalTextBlock,
|
||||
Context context,
|
||||
List<TextPageBlock> textBlocksToMerge,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
Context context, LayoutParsingType layoutParsingType) {
|
||||
|
||||
Page page = context.getPage(originalTextBlock.getPage());
|
||||
|
||||
@ -129,17 +125,10 @@ public class DocumentGraphFactory {
|
||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
}
|
||||
|
||||
List<TextPageBlock> textBlocks = new ArrayList<>();
|
||||
textBlocks.add(originalTextBlock);
|
||||
textBlocks.addAll(textBlocksToMerge);
|
||||
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSort(textBlocks), node, context, page);
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSort(originalTextBlock), node, context, page);
|
||||
|
||||
if (node instanceof DuplicatedParagraph duplicatedParagraph) {
|
||||
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream()
|
||||
.flatMap(tb -> tb.getWords()
|
||||
.stream())
|
||||
.collect(Collectors.toList()), node, context, page);
|
||||
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(originalTextBlock.getWords(), node, context, page);
|
||||
duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock);
|
||||
}
|
||||
|
||||
|
||||
@ -29,19 +29,19 @@ public class SearchTextWithTextPositionFactory {
|
||||
public static final double LINEBREAK_DELTA_TOLERANCE = 1.5;
|
||||
|
||||
|
||||
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<Word> sequences) {
|
||||
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<Word> words) {
|
||||
|
||||
if (sequences.isEmpty() || sequences.stream()
|
||||
if (words.isEmpty() || words.stream()
|
||||
.allMatch(sequence -> sequence.getCharacters().isEmpty())) {
|
||||
return SearchTextWithTextPositionDto.empty();
|
||||
}
|
||||
|
||||
Context context = new Context();
|
||||
|
||||
RedTextPosition currentTextPosition = sequences.get(0).getCharacters().get(0).getTextPosition();
|
||||
RedTextPosition currentTextPosition = words.get(0).getCharacters().get(0).getTextPosition();
|
||||
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build();
|
||||
|
||||
for (Word word : sequences) {
|
||||
for (Word word : words) {
|
||||
for (int i = 0; i < word.getCharacters().size(); ++i) {
|
||||
|
||||
currentTextPosition = word.getCharacters().get(i).getTextPosition();
|
||||
@ -66,7 +66,7 @@ public class SearchTextWithTextPositionFactory {
|
||||
++context.stringIdx;
|
||||
}
|
||||
|
||||
List<Rectangle2D> positions = sequences.stream()
|
||||
List<Rectangle2D> positions = words.stream()
|
||||
.map(Word::getCharacters)
|
||||
.flatMap(Collection::stream)
|
||||
.map(Character::getTextPosition)
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.factory;
|
||||
|
||||
import static java.lang.String.format;
|
||||
import static java.util.Collections.emptyList;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
@ -17,12 +17,13 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.tables.TableMergingUtility;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@ -60,7 +61,7 @@ public class SectionNodeFactory {
|
||||
|
||||
section.setTreeId(getTreeId(parentNode, context, section));
|
||||
|
||||
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document);
|
||||
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section);
|
||||
boolean containsTablesAndTextBlocks = containsTablesAndTextBlocks(pageBlocks);
|
||||
if (containsTablesAndTextBlocks) {
|
||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
|
||||
@ -73,8 +74,13 @@ public class SectionNodeFactory {
|
||||
} else if (type.equals(SectionTreeEntry.Type.SUPER_SECTION)) {
|
||||
// If a SuperSection contains more blocks than just a headline, we add a Section which contains the remaining textblocks.
|
||||
addSection(layoutParsingType, section, SectionTreeEntry.Type.SECTION, pageBlocks, emptyList(), context, document);
|
||||
} else if (!pageBlocks.isEmpty() && pageBlocks.get(0) instanceof TextPageBlock) {
|
||||
List<TextPageBlock> textPageBlocks = pageBlocks.stream()
|
||||
.map(block -> (TextPageBlock) block)
|
||||
.toList();
|
||||
addParagraphsAndHeadlinesToSection(layoutParsingType, textPageBlocks, context, section);
|
||||
} else {
|
||||
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
|
||||
addTablesToSection(pageBlocks, context, section, document, layoutParsingType);
|
||||
}
|
||||
|
||||
images.stream()
|
||||
@ -85,6 +91,28 @@ public class SectionNodeFactory {
|
||||
}
|
||||
|
||||
|
||||
private static void addTablesToSection(List<AbstractPageBlock> pageBlocks,
|
||||
DocumentGraphFactory.Context context,
|
||||
AbstractSemanticNode section,
|
||||
Document document,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
|
||||
List<AbstractPageBlock> remainingBlocks = new ArrayList<>(pageBlocks);
|
||||
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
||||
for (AbstractPageBlock abstractPageBlock : pageBlocks) {
|
||||
if (alreadyMerged.contains(abstractPageBlock)) {
|
||||
continue;
|
||||
}
|
||||
if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(tablesToMerge);
|
||||
remainingBlocks.removeAll(tablesToMerge);
|
||||
TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context, document);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, AbstractSemanticNode section) {
|
||||
|
||||
if (parentNode == null) {
|
||||
@ -98,54 +126,63 @@ public class SectionNodeFactory {
|
||||
private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
DocumentGraphFactory.Context context,
|
||||
AbstractSemanticNode section,
|
||||
Document document) {
|
||||
AbstractSemanticNode section) {
|
||||
|
||||
if (pageBlocks.get(0).isHeadline()) {
|
||||
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, List.of(pageBlocks.get(0)), context, section, document);
|
||||
addParagraphsAndHeadlinesToSection(layoutParsingType, List.of((TextPageBlock) pageBlocks.get(0)), context, section);
|
||||
pageBlocks.remove(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
DocumentGraphFactory.Context context,
|
||||
AbstractSemanticNode section,
|
||||
Document document) {
|
||||
private void addParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
|
||||
List<TextPageBlock> pageBlocks,
|
||||
DocumentGraphFactory.Context context,
|
||||
AbstractSemanticNode section) {
|
||||
|
||||
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
||||
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
|
||||
for (AbstractPageBlock abstractPageBlock : pageBlocks) {
|
||||
List<TextPageBlock> mergedPageBlocks = pageBlocks;
|
||||
if (pageBlocks.size() > 1 && (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) || layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER_OLD))) {
|
||||
mergedPageBlocks = mergeBlocks(pageBlocks);
|
||||
}
|
||||
|
||||
if (alreadyMerged.contains(abstractPageBlock)) {
|
||||
continue;
|
||||
}
|
||||
for (TextPageBlock textPageBlock : mergedPageBlocks) {
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, textPageBlock, context, layoutParsingType);
|
||||
}
|
||||
}
|
||||
|
||||
remainingBlocks.removeAll(alreadyMerged);
|
||||
|
||||
if (abstractPageBlock instanceof TextPageBlock) {
|
||||
private static List<TextPageBlock> mergeBlocks(List<TextPageBlock> pageBlocks) {
|
||||
|
||||
switch (layoutParsingType) {
|
||||
case REDACT_MANAGER, DOCUMINE, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> {
|
||||
alreadyMerged.add(abstractPageBlock);
|
||||
remainingBlocks.remove(abstractPageBlock);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType);
|
||||
}
|
||||
default -> {
|
||||
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(textBlocks);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks, layoutParsingType);
|
||||
}
|
||||
UnionFind<TextPageBlock> blockUnionFind = new UnionFind<>(new HashSet<>(pageBlocks));
|
||||
for (int i = 0; i < pageBlocks.size(); i++) {
|
||||
TextPageBlock textPageBlock1 = pageBlocks.get(i);
|
||||
for (int j = i; j < pageBlocks.size(); j++) {
|
||||
if (i == j) {
|
||||
continue;
|
||||
}
|
||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(tablesToMerge);
|
||||
TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context, document);
|
||||
} else {
|
||||
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
|
||||
var textPageBlock2 = pageBlocks.get(j);
|
||||
if (!Objects.equals(textPageBlock2.getPage(), textPageBlock1.getPage())) {
|
||||
continue;
|
||||
}
|
||||
if (!Objects.equals(textPageBlock2.getDir(), textPageBlock1.getDir())) {
|
||||
continue;
|
||||
}
|
||||
if (!Objects.equals(textPageBlock2.getClassification(), textPageBlock1.getClassification())) {
|
||||
continue;
|
||||
}
|
||||
if (!textPageBlock2.intersectsYPdf(textPageBlock1)) {
|
||||
continue;
|
||||
}
|
||||
if (textPageBlock2.isToDuplicate()) {
|
||||
continue;
|
||||
}
|
||||
blockUnionFind.union(textPageBlock2, textPageBlock1);
|
||||
}
|
||||
}
|
||||
return blockUnionFind.getGroups()
|
||||
.stream()
|
||||
.map(TextPageBlock::merge)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -222,18 +259,4 @@ public class SectionNodeFactory {
|
||||
return splitList;
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsY(TextPageBlock atc, List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
return pageBlocks.stream()
|
||||
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
|
||||
.filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage())
|
||||
.filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock)
|
||||
.filter(abstractTextContainer -> abstractTextContainer.intersectsYPdf(atc))
|
||||
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
|
||||
.filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir())
|
||||
.filter(abstractTextContainer -> !abstractTextContainer.isToDuplicate())
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
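The new mergeBlocks method above leans on the shared UnionFind helper: every pair of TextPageBlocks that passes the page, direction, classification and Y-overlap checks is unioned, and getGroups() then yields the transitively merged groups. A minimal, hypothetical usage sketch of that pattern, using only the calls visible in this diff (constructor over a set, union(a, b), getGroups()); the string-based predicate and class name are made up for illustration:

import java.util.HashSet;
import java.util.List;

import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;

class UnionFindGroupingSketch {

    // Groups strings that share a first letter; transitive pairs end up in one group.
    static void printGroups(List<String> items) {
        UnionFind<String> unionFind = new UnionFind<>(new HashSet<>(items));
        for (String a : items) {
            for (String b : items) {
                if (!a.equals(b) && a.charAt(0) == b.charAt(0)) {
                    unionFind.union(a, b); // same first letter -> same group
                }
            }
        }
        unionFind.getGroups().forEach(System.out::println);
    }

    public static void main(String[] args) {
        printGroups(List.of("alpha", "apple", "beta", "bravo", "gamma"));
    }
}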
@@ -4,7 +4,6 @@ import static java.util.Collections.emptyList;

import java.util.Collection;
import java.util.List;
import java.util.stream.Collectors;

import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.GenericSemanticNode;
@@ -17,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;

@@ -50,8 +50,6 @@ public class TableNodeFactory {
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
table.setTreeId(treeId);
addTableCells(layoutParsingType, mergedRows, table, context, document);

ifTableHasNoHeadersSetFirstRowAsHeaders(table);
}

@@ -76,16 +74,6 @@ public class TableNodeFactory {
}

private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {

if (table.streamHeaders()
.findAny().isEmpty()) {
table.streamRow(0)
.forEach(tableCellNode -> tableCellNode.setHeader(true));
}
}

private void addTableCells(LayoutParsingType layoutParsingType, List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context, Document document) {

for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
@@ -115,32 +103,32 @@ public class TableNodeFactory {
TextBlock textBlock;
if (cell.getTextBlocks().isEmpty()) {
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
} else if (cell.getTextBlocks().size() == 1) {
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getWords(), tableCell, context, page);
} else if (cell.getTextBlocks().size() == 1 && cell.getTextBlocks().get(0) instanceof TextPageBlock textPageBlock) {
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(textPageBlock.getWords(), tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else if (firstTextBlockIsHeadline(cell)) {
SectionNodeFactory.addSection(layoutParsingType,
tableCell,
SectionTreeEntry.Type.SECTION,
cell.getTextBlocks()
.stream()
.map(tb -> (AbstractPageBlock) tb)
.collect(Collectors.toList()),
emptyList(),
context,
document);
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
List<Word> sequences = TextPositionOperations.mergeAndSort(cell.getTextBlocks());
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
} else if (firstTextBlockIsHeadline(cell) || containsTables(cell.getTextBlocks())) {
SectionNodeFactory.addSection(layoutParsingType, tableCell, SectionTreeEntry.Type.SECTION, cell.getTextBlocks(), emptyList(), context, document);
} else if (cellAreaIsSmallerThanThreshold(cell, page)) {
List<Word> words = TextPositionOperations.sort(cell.getWords());
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(words, tableCell, context, page);
tableCell.setLeafTextBlock(textBlock);
} else {
cell.getTextBlocks()
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList(), layoutParsingType));
.stream()
.map(block -> (TextPageBlock) block)
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, layoutParsingType));
}
}

private boolean cellAreaIsSmallerThanPageAreaTimesThreshold(Cell cell, Page page) {
private boolean containsTables(List<AbstractPageBlock> pageBlocks) {

return pageBlocks.stream()
.anyMatch(pageBlock -> pageBlock instanceof TablePageBlock);
}

private boolean cellAreaIsSmallerThanThreshold(Cell cell, Page page) {

return cell.getArea() < TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth();
}

@@ -18,16 +18,16 @@ public class TextBlockFactory {
long textBlockIdx;

public AtomicTextBlock buildAtomicTextBlock(List<Word> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
public AtomicTextBlock buildAtomicTextBlock(List<Word> words, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {

Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page);
return buildAtomicTextBlock(sequences, parent, numberOnPage, page);
return buildAtomicTextBlock(words, parent, numberOnPage, page);
}

public AtomicTextBlock buildAtomicTextBlock(List<Word> sequences, SemanticNode parent, Integer numberOnPage, Page page) {
public AtomicTextBlock buildAtomicTextBlock(List<Word> words, SemanticNode parent, Integer numberOnPage, Page page) {

SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionDto(sequences);
SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionDto(words);
int offset = stringOffset;
stringOffset += searchTextWithTextPositionDto.getSearchText().length();
long idx = textBlockIdx;

@@ -11,14 +11,15 @@ import java.util.stream.Collectors;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.springframework.stereotype.Service;

import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;

import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;

@Service
@UtilityClass
public class FindGraphicsRaster {

// Pixels that are lighter then this threshold are ignored
@@ -33,7 +34,8 @@ public class FindGraphicsRaster {

var renderer = new PDFRenderer(doc);
var img = renderer.renderImageWithDPI(pageInformation.number() - 1, DPI, ImageType.GRAY);
var imageCtm = CoordinateTransforms.calculateImageCoordsToInitialUserSpaceCoords(pageInformation, CoordinateTransforms.calculateScalingFactor(pageInformation, img.getWidth()));
var imageCtm = CoordinateTransforms.calculateImageCoordsToInitialUserSpaceCoords(pageInformation,
CoordinateTransforms.calculateScalingFactor(pageInformation, img.getWidth()));
return findCCBoundingBoxes(img, remove, THRESHOLD, DPI / 72, imageCtm);
}

@@ -47,13 +49,15 @@ public class FindGraphicsRaster {
var w = image.getWidth();
var pixels = new int[w * h];
image.getRaster().getPixels(0, 0, w, h, pixels);
remove.stream().map(rect -> inverseCTM.createTransformedShape(rect).getBounds2D()).forEach(box -> {
for (int y = (int) Math.floor(box.getMinY() / rescale); y <= (int) Math.min(Math.ceil(box.getMaxY() / rescale), h); y++) {
for (int x = (int) Math.floor(box.getMinX() / rescale); x <= (int) Math.min(Math.ceil(box.getMaxX() / rescale), w); x++) {
pixels[w * y + x] = grayScaleTresh;
}
}
});
remove.stream()
.map(rect -> RectangleTransformations.transform(rect, inverseCTM))
.forEach(box -> {
for (int y = (int) Math.floor(box.getMinY() / rescale); y <= (int) Math.min(Math.ceil(box.getMaxY() / rescale), h); y++) {
for (int x = (int) Math.floor(box.getMinX() / rescale); x <= (int) Math.min(Math.ceil(box.getMaxX() / rescale), w); x++) {
pixels[w * y + x] = grayScaleTresh;
}
}
});

// var image2 = createImageFromMatrix(pixels, w, h);

@@ -130,8 +134,10 @@ public class FindGraphicsRaster {
}
}
}
return boundingBoxes.stream().filter(box -> box.area() > 0).map(box -> box.transform(imageCTM)).collect(Collectors.toList());
return boundingBoxes.stream()
.filter(box -> box.area() > 0)
.map(box -> box.transform(imageCTM))
.collect(Collectors.toList());
}

}
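FindGraphicsRaster above thresholds a grayscale render of the page, blanks out the boxes passed in via the remove parameter, and then collects bounding boxes of the remaining dark connected components. The following is a rough, self-contained sketch of that connected-component step under simplified assumptions (flat int pixel array, dark-means-foreground threshold); it is not the production findCCBoundingBoxes implementation:

import java.awt.Rectangle;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.List;

class ConnectedComponentSketch {

    // Collects a bounding box per 4-connected component of pixels darker than threshold.
    static List<Rectangle> boundingBoxes(int[] pixels, int w, int h, int threshold) {
        boolean[] visited = new boolean[w * h];
        List<Rectangle> boxes = new ArrayList<>();
        for (int start = 0; start < pixels.length; start++) {
            if (visited[start] || pixels[start] >= threshold) {
                continue; // background or already assigned to a component
            }
            int minX = w, minY = h, maxX = 0, maxY = 0;
            ArrayDeque<Integer> queue = new ArrayDeque<>();
            queue.add(start);
            visited[start] = true;
            while (!queue.isEmpty()) {
                int idx = queue.poll();
                int x = idx % w, y = idx / w;
                minX = Math.min(minX, x); maxX = Math.max(maxX, x);
                minY = Math.min(minY, y); maxY = Math.max(maxY, y);
                int[][] neighbours = {{x + 1, y}, {x - 1, y}, {x, y + 1}, {x, y - 1}};
                for (int[] n : neighbours) {
                    if (n[0] < 0 || n[0] >= w || n[1] < 0 || n[1] >= h) continue;
                    int nIdx = n[1] * w + n[0];
                    if (!visited[nIdx] && pixels[nIdx] < threshold) {
                        visited[nIdx] = true;
                        queue.add(nIdx);
                    }
                }
            }
            boxes.add(new Rectangle(minX, minY, maxX - minX + 1, maxY - minY + 1));
        }
        return boxes;
    }
}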
@@ -4,15 +4,14 @@ import java.awt.geom.Rectangle2D;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.springframework.stereotype.Service;

import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;

import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
@@ -25,32 +24,13 @@ public class GraphicExtractorService {
private static final int MIN_GRAPHICS_AREA = 500;

private final GraphicsClusteringService graphicsClusteringService;
private final FindGraphicsRaster findGraphicsRaster;

@SneakyThrows
public List<Box> extractPathElementGraphics(PDDocument pdDocument,
PDPage pdPage,
int pageNumber,
CleanRulings cleanRulings,
List<Word> words,
boolean graphicsRaster) {
public List<ClassifiedImage> extractPathElementGraphics(List<Box> graphicBBoxes, int pageNumber, CleanRulings cleanRulings) {

List<Box> characterBBoxes = getCharacterBBoxes(words);
List<Box> classifiedRulingsBoxes = getLineBBoxesOfAllClassifiedRulings(cleanRulings);

GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true);
List<Box> graphicBBoxes = graphicBBDetector.findGraphicBB();

if (graphicsRaster) {
// This should only be used if ocr was performed, it is currently in an early stage and needs to be improved.
graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument,
characterBBoxes.stream()
.map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4))
.collect(Collectors.toList()),
PageInformation.fromPDPage(pageNumber, pdPage)));
}

List<Box> filteredGraphicBBoxes = graphicBBoxes.stream()
.filter(box -> !box.intersectsAny(classifiedRulingsBoxes, 4))
.collect(Collectors.toList());
@@ -59,19 +39,11 @@ public class GraphicExtractorService {

return clusters.stream()
.filter(box -> box.area() > MIN_GRAPHICS_AREA && box.height() > MIN_GRAPHICS_SIDE_LENGTH && box.width() > MIN_GRAPHICS_SIDE_LENGTH)
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, pageNumber, ""))
.toList();
}

private List<Box> getCharacterBBoxes(List<Word> words) {

return words.stream()
.map(BoundingBox::getBBoxPdf)
.map(Box::new)
.collect(Collectors.toList());
}

private List<Box> getLineBBoxesOfAllClassifiedRulings(CleanRulings cleanRulings) {

return cleanRulings.buildAll()

@@ -14,7 +14,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.viewerdoc.model.Outline;

import lombok.SneakyThrows;

@@ -199,11 +199,18 @@ public class PDFLinesTextStripper extends PDFTextStripper {
}

private void addVisibleRulings(List<Ruling> path, boolean stroke) throws IOException {
private void addVisibleRulings(List<Ruling> path, boolean stroke) {

try {
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && isBlack(getGraphicsState().getStrokingColor()) || //
!stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) {
// see spec '8.4.3.6 Line dash pattern'
var dashPattern = getGraphicsState().getLineDashPattern();
if (dashPattern != null && dashPattern.getDashArray().length > 0) {
path.forEach(r -> r.setStyle(Ruling.Style.DASHED));
} else {
path.forEach(r -> r.setStyle(Ruling.Style.SOLID));
}
rulings.addAll(path);
}
} catch (UnsupportedOperationException e) {
@@ -247,9 +254,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
}

if (!words.isEmpty()) {
previous = words.get(words.size() - 1)
.getCharacters()
.get(words.get(words.size() - 1).getCharacters().size() - 1).getTextPosition();
previous = words.get(words.size() - 1).getCharacters().get(words.get(words.size() - 1).getCharacters().size() - 1).getTextPosition();
}

if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) {

@@ -0,0 +1,138 @@
package com.knecon.fforesight.service.layoutparser.processor.services.tables;

import java.awt.geom.AffineTransform;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;

import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;

import lombok.experimental.UtilityClass;

@UtilityClass
public class AreaSweepGridifier {

public static final double CELL_AREA_CONTAINED_THRESHOLD = 0.8;
public static final double MIN_SIZE_FACTOR = 0.5;

/**
* Calculates the grid structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
* Works well for perfectly straight tables, but fails as soon as the tables are slightly rotated. Then the area sweep will drop some cells or duplicate them unnecessarily.
*
* @return TablePageBlock Structure as a rows of cells matrix
*/
public List<List<Cell>> gridify(Collection<Cell> cells, AffineTransform pageToPdfTransform, double minCellWidth, double minCellHeight) {

if (cells.isEmpty()) {
return new ArrayList<>();
}

var colDividers = getColDividers(cells, minCellWidth);
var rowDividers = getRowDividers(cells, minCellHeight);

List<List<Cell>> rowsOfCells = new ArrayList<>();

for (int i = 1; i < rowDividers.size(); i++) {
double prevY = rowDividers.get(i - 1);
double y = rowDividers.get(i);

List<Cell> row = new ArrayList<>();

for (int j = 1; j < colDividers.size(); j++) {
double prevX = colDividers.get(j - 1);
double x = colDividers.get(j);

var cellFromGridStructure = Cell.fromPageCoordinates(new Point2D.Double(prevX, prevY), new Point2D.Double(x, y), pageToPdfTransform);

if (!cellFromGridStructure.hasMinimumSize()) {
continue;
}

Optional<Cell> matchingCell = cells.stream()
.map(originalCell -> new CellWithIntersection(originalCell,
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBox(), originalCell.getBBox())))
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea() > 0)
.filter(cellWithIntersection -> cellFromGridStructure.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
.max(Comparator.comparing(CellWithIntersection::intersectedArea))
.map(CellWithIntersection::originalCell);

if (matchingCell.isPresent()) {
cellFromGridStructure.getTextBlocks().addAll(matchingCell.get().getTextBlocks());
cellFromGridStructure.setHeaderCell(matchingCell.get().isHeaderCell());
}

row.add(cellFromGridStructure);

}

rowsOfCells.add(row);
}

return rowsOfCells;
}

private List<Double> getRowDividers(Collection<Cell> cells, double minCellHeight) {

Set<Double> uniqueY = new HashSet<>();
cells.stream()
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
.forEach(c -> {
uniqueY.add(c.getMinY());
uniqueY.add(c.getMaxY());
});

return deduplicate(uniqueY, minCellHeight * MIN_SIZE_FACTOR);
}

private List<Double> getColDividers(Collection<Cell> cells, double minCellWidth) {

Set<Double> uniqueX = new HashSet<>();
cells.stream()
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
.forEach(c -> {
uniqueX.add(c.getMinX());
uniqueX.add(c.getMaxX());
});

return deduplicate(uniqueX, minCellWidth * MIN_SIZE_FACTOR);
}

private List<Double> deduplicate(Set<Double> doubles, double minDistance) {
// finds all doubles less than the minDistance apart and replaces them with their average
UnionFind<Double> uf = new UnionFind<>(doubles);
for (Double x : doubles) {
for (Double x2 : doubles) {
if (x.equals(x2)) {
continue;
}
if (Math.abs(x - x2) < minDistance) {
uf.union(x, x2);
}
}
}
return uf.getGroups()
.stream()
.map(xs -> xs.stream()
.mapToDouble(Double::doubleValue).average()
.orElseThrow())
.sorted()
.toList();
}

record CellWithIntersection(Cell originalCell, double intersectedArea) {

}

}
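The area sweep in the new AreaSweepGridifier hinges on the divider lists: all cell edges are collected, edges that lie closer together than the minimum cell size are collapsed to their average, and each pair of neighbouring dividers then delimits one row or column of the grid. Below is a self-contained sketch of that collapsing step using a simple greedy sort-and-group variant instead of the UnionFind used in deduplicate above; the values and class name are illustrative only:

import java.util.ArrayList;
import java.util.List;

class DividerSketch {

    // Collapses groups of nearby edge coordinates (already sorted ascending) to their average.
    static List<Double> collapse(double[] sortedEdges, double minDistance) {
        List<Double> dividers = new ArrayList<>();
        int i = 0;
        while (i < sortedEdges.length) {
            int j = i;
            double sum = 0;
            // group consecutive edges that are closer to the group start than minDistance
            while (j < sortedEdges.length && sortedEdges[j] - sortedEdges[i] < minDistance) {
                sum += sortedEdges[j];
                j++;
            }
            dividers.add(sum / (j - i)); // replace the group with its average
            i = j;
        }
        return dividers;
    }

    public static void main(String[] args) {
        double[] xEdges = {10.0, 10.6, 80.2, 80.4, 150.0};
        // prints roughly [10.3, 80.3, 150.0]: three dividers, i.e. two columns
        System.out.println(collapse(xEdges, 5.0));
    }
}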
@@ -0,0 +1,257 @@
package com.knecon.fforesight.service.layoutparser.processor.services.tables;

import java.awt.geom.AffineTransform;
import java.awt.geom.Line2D;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.LinkedQuadPointCell;
import com.knecon.fforesight.service.layoutparser.processor.model.table.QuadPoint;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;

import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;

@Slf4j
public class QuadPointGridifier {

public static final int MAX_SPLITTING_ITERATIONS = 10;
Set<LinkedQuadPointCell> cells;
AffineTransform pageToPdfTransform;
AffineTransform pdfToPageTransform;
double minCellHeight;
double minCellWidth;

@SneakyThrows
QuadPointGridifier(Collection<LinkedQuadPointCell> cells, AffineTransform pdfToPageTransform) {

this.cells = new HashSet<>(cells);
this.pageToPdfTransform = pdfToPageTransform.createInverse();
this.pdfToPageTransform = pdfToPageTransform;
this.minCellHeight = cells.stream()
.map(LinkedQuadPointCell::getQuadPoint)
.flatMap(this::verticalLines)
.mapToDouble(QuadPointGridifier::length)
.min().orElse(0) * 0.75;
this.minCellWidth = cells.stream()
.map(LinkedQuadPointCell::getQuadPoint)
.flatMap(this::horizontalLines)
.mapToDouble(QuadPointGridifier::length)
.min().orElse(0) * 0.75;
}

public Stream<Line2D> horizontalLines(QuadPoint quadPoint) {

return Stream.of(quadPoint.getBottomLine(), quadPoint.getTopLine());
}

public Stream<Line2D> verticalLines(QuadPoint quadPoint) {

return Stream.of(quadPoint.getRightLine(), quadPoint.getLeftLine());
}

public static double length(Line2D line) {

double xAbs = Math.abs(line.getX1() - line.getX2());
double yAbs = Math.abs(line.getY1() - line.getY2());
return Math.sqrt(xAbs * xAbs + yAbs * yAbs);
}

/**
* Calculates the grid structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
* Checks if any cell has more than one neighbor in any direction, if it does, it splits the cell according to its neighbors.
* This is repeated until no more splits are necessary. Then the rows are computed using that very same linked neighbor structure starting with the top left cell.
*
* @return TablePageBlock Structure as a rows of cells matrix
*/
public List<List<Cell>> gridify() {

var linkedCells = cells.stream()
.toList();

computeNeighbours(linkedCells);
int splits = 0;
while (linkedCells.stream()
.anyMatch(LinkedQuadPointCell::needsSplit) && splits < MAX_SPLITTING_ITERATIONS) {

List<LinkedQuadPointCell> newCells = new LinkedList<>();
for (LinkedQuadPointCell linkedCell : linkedCells) {
if (linkedCell.needsSplit()) {
newCells.addAll(linkedCell.split(minCellWidth, minCellHeight));
} else {
newCells.add(linkedCell);
}
}
computeNeighbours(newCells);
linkedCells = newCells;
splits++;
}

return buildStructure(linkedCells);
}

private List<List<Cell>> buildStructure(List<LinkedQuadPointCell> cells) {

if (cells.isEmpty()) {
return Collections.emptyList();
}
List<List<LinkedQuadPointCell>> rows = buildRows(cells);
List<List<Cell>> cellRows = mapToCells(rows);
if (isNotRectangular(rows)) {
log.error("Non rectangular table on page {}",
cells.stream()
.map(LinkedQuadPointCell::getPageBlocks)
.flatMap(List::stream)
.map(AbstractPageBlock::getWords)
.flatMap(Collection::stream)
.map(Word::getPage)
.findAny().orElse(0));
// sometimes this algorithm fails to produce a rectangular table, this happens when the lines are so tilted it eventually produces a cell which is skipped due to being too small, leading to non-rectangular rows.
// Then we use the area sweep algorithm as a fallback.
return AreaSweepGridifier.gridify(this.cells.stream()
.map(this::toCell)
.toList(), pageToPdfTransform, minCellWidth, minCellHeight);
}
cellRows = removeEmptyRows(cellRows);
cellRows = removeEmptyCols(cellRows);
return cellRows;
}

private List<List<Cell>> mapToCells(List<List<LinkedQuadPointCell>> rows) {

return rows.stream()
.map(row -> row.stream()
.map(this::toCell)
.toList())
.toList();
}

private Cell toCell(LinkedQuadPointCell qpCell) {

Cell cell = Cell.fromPageCoordinates(qpCell.getQuadPoint().getBounds2D(), pageToPdfTransform);
cell.setTextBlocks(qpCell.getPageBlocks());
cell.setHeaderCell(qpCell.isHeaderCell());
return cell;
}

private boolean isNotRectangular(List<List<LinkedQuadPointCell>> rows) {

if (rows.isEmpty()) {
return true;
}
int n = rows.get(0).size();
return rows.stream()
.anyMatch(row -> row.size() != n);
}

private List<List<LinkedQuadPointCell>> buildRows(List<LinkedQuadPointCell> cells) {

List<LinkedQuadPointCell> topLeftCandidates = cells.stream()
.filter(LinkedQuadPointCell::isTopLeft)
.toList();

if (topLeftCandidates.size() != 1) {
log.error("More than one top-left cell found!");
}
var cell = topLeftCandidates.get(0);

List<List<LinkedQuadPointCell>> rows = new ArrayList<>();
rows.add(buildRow(cell));
while (!cell.getBelows().isEmpty()) {
cell = cell.getBelows().get(0);
rows.add(buildRow(cell));
}
return rows;
}

private static List<LinkedQuadPointCell> buildRow(LinkedQuadPointCell cell) {

List<LinkedQuadPointCell> currentRow = new ArrayList<>();
LinkedQuadPointCell nextCell = cell;
currentRow.add(cell);
while (!nextCell.getRights().isEmpty()) {
nextCell = nextCell.getRights().get(0);
currentRow.add(nextCell);
}
return currentRow;
}

private void computeNeighbours(List<LinkedQuadPointCell> cells) {

for (LinkedQuadPointCell cell : cells) {
cell.resetNeighbours();
computeNeighbours(cell, cells);
}

}

private void computeNeighbours(LinkedQuadPointCell cell, List<LinkedQuadPointCell> otherCells) {

for (LinkedQuadPointCell otherCell : otherCells) {
if (cell.equals(otherCell)) {
continue;
}
cell.addToNeighbours(otherCell, minCellWidth, minCellHeight);
}

}

static <T> List<List<T>> transpose(List<List<T>> table) {

List<List<T>> ret = new ArrayList<List<T>>();
final int N = table.get(0).size();
for (int i = 0; i < N; i++) {
List<T> col = new ArrayList<T>();
for (List<T> row : table) {
col.add(row.get(i));
}
ret.add(col);
}
return ret;
}

private List<List<Cell>> removeEmptyCols(List<List<Cell>> rowsOfCells) {

if (rowsOfCells.isEmpty()) {
return rowsOfCells;
}

var colsOfCells = transpose(rowsOfCells);
colsOfCells = removeEmptyRows(colsOfCells);
return transpose(colsOfCells);
}

private List<List<Cell>> removeEmptyRows(List<List<Cell>> rowsOfCells) {

return rowsOfCells.stream()
.filter(row -> row.stream()
.anyMatch(cell -> !cell.getTextBlocks().isEmpty()))
.collect(Collectors.toList());
}

}
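The gridify Javadoc above describes how rows are read off the linked neighbour structure once no cell needs splitting anymore: walk the rights chain from the leftmost cell of a row, then continue at the belows neighbour of that row's first cell. A stripped-down, hypothetical sketch of that walk with single right/below links (the real LinkedQuadPointCell keeps lists of neighbours):

import java.util.ArrayList;
import java.util.List;

class LinkedGridWalkSketch {

    static final class Node {
        Node right;
        Node below;
        final String label;
        Node(String label) { this.label = label; }
    }

    // Reads the grid row by row, starting at the top-left node.
    static List<List<String>> rows(Node topLeft) {
        List<List<String>> rows = new ArrayList<>();
        for (Node rowStart = topLeft; rowStart != null; rowStart = rowStart.below) {
            List<String> row = new ArrayList<>();
            for (Node cell = rowStart; cell != null; cell = cell.right) {
                row.add(cell.label);
            }
            rows.add(row);
        }
        return rows;
    }

    public static void main(String[] args) {
        Node a = new Node("A1"), b = new Node("B1"), c = new Node("A2"), d = new Node("B2");
        a.right = b; c.right = d; a.below = c; b.below = d;
        System.out.println(rows(a)); // [[A1, B1], [A2, B2]]
    }
}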
@@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
package com.knecon.fforesight.service.layoutparser.processor.services.tables;

import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR;

@@ -14,15 +14,6 @@ public class RectangularIntersectionFinder {

public static List<Rectangle2D> find(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {

// // Fix for 211.pdf
// for (Ruling r : horizontalRulingLines) {
// if (r.getX2() < r.getX1()) {
// double a = r.getX2();
// r.x2 = (float) r.getX1();
// r.x1 = (float) a;
// }
// }

List<Rectangle2D> foundRectangles = new ArrayList<>();
Map<Point2D, RulingIntersectionFinder.IntersectingRulings> intersectionPoints = RulingIntersectionFinder.findNaive(horizontalRulingLines, verticalRulingLines);

@@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
package com.knecon.fforesight.service.layoutparser.processor.services.tables;

import java.awt.geom.Point2D;
import java.util.Collections;
@@ -10,6 +10,7 @@ import java.util.Optional;
import java.util.TreeMap;

import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;

import lombok.experimental.UtilityClass;
import lombok.extern.slf4j.Slf4j;
@@ -33,7 +34,7 @@ public class RulingIntersectionFinder {
*/
/*
* The algorithm assumes there are only horizontal and vertical lines which are unique in their coordinates. (E.g. no overlapping horizontal lines exist)
* As a high level overview, the algorithm uses a sweep line advancing from left to right.
* As a high level overview, the algorithm uses a sweep line advancing from lefts to rights.
* It dynamically updates the horizontal rulings which are intersected by the current sweep line.
* When the sweep line hits a vertical line, it then checks for all intersections with the currently intersected horizontal rulings.
* THe trick of the algorithm is using a binary search tree to store the currently intersected horizontal rulings. This way the lookup should be in O(log n).
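The comment block above outlines the sweep-line idea behind RulingIntersectionFinder: advance over the vertical rulings from left to right and look up the horizontal rulings currently crossed by the sweep in a binary search tree. The sketch below is a simplified, self-contained variant that rebuilds the active set for every vertical instead of updating it incrementally; the TreeMap.subMap range lookup is the O(log n) part the comment refers to, and the class and method names are made up:

import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.TreeMap;

class SweepLineIntersectionSketch {

    // horizontals: y constant with x1 <= x2; verticals: x constant with y1 <= y2
    static List<Point2D> intersections(List<Line2D> horizontals, List<Line2D> verticals) {
        List<Point2D> result = new ArrayList<>();
        List<Line2D> sortedVerticals = new ArrayList<>(verticals);
        // sweep order: process vertical rulings from left to right
        sortedVerticals.sort(Comparator.comparingDouble(Line2D::getX1));
        for (Line2D v : sortedVerticals) {
            double x = v.getX1();
            // active horizontals: those whose x-range covers the sweep position, keyed by y
            TreeMap<Double, Line2D> active = new TreeMap<>();
            for (Line2D h : horizontals) {
                if (h.getX1() <= x && x <= h.getX2()) {
                    active.put(h.getY1(), h);
                }
            }
            // every active horizontal whose y lies in the vertical's y-range intersects it
            active.subMap(v.getY1(), true, v.getY2(), true)
                  .keySet()
                  .forEach(y -> result.add(new Point2D.Double(x, y)));
        }
        return result;
    }
}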
@@ -1,4 +1,4 @@
package com.knecon.fforesight.service.layoutparser.processor.utils;
package com.knecon.fforesight.service.layoutparser.processor.services.tables;

import java.awt.geom.Line2D;
import java.awt.geom.Point2D;
@@ -12,7 +12,7 @@ public final class RulingTextDirAdjustUtil {

/**
* Converts a ruling (line of a table) the same way TextPositions are converted in PDFBox.
* This will get the y position of the text, adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
* This will get the y position of the text, adjusted so that 0,0 is upper lefts and it is adjusted based on the text direction.
* <p>
* See org.apache.pdfbox.text.TextPosition
*/
@@ -0,0 +1,109 @@
package com.knecon.fforesight.service.layoutparser.processor.services.tables;

import java.awt.geom.AffineTransform;
import java.awt.geom.Rectangle2D;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;

import lombok.SneakyThrows;
import lombok.experimental.UtilityClass;

@UtilityClass
public class TableAreaFiller {

public Set<Cell> findMissingCells(List<Cell> cells, Rectangle2D areaPDF, AffineTransform pdfToPageTransform) {

var area = RectangleTransformations.transform(areaPDF, pdfToPageTransform);

List<Rectangle2D> rectangles = cells.stream()
.map(BoundingBox::getBBox)
.toList();
Set<Rectangle2D> unfilledRects = findMissingRects(rectangles, area);

AffineTransform pageToPdfTransform = getInverse(pdfToPageTransform);

return unfilledRects.stream()
.map(rect -> Cell.fromPageCoordinates(rect, pageToPdfTransform))
.collect(Collectors.toSet());
}

public static Set<Rectangle2D> findMissingRects(List<Rectangle2D> rectangles, Rectangle2D area) {

double minWidth = rectangles.stream()
.mapToDouble(Rectangle2D::getWidth)
.min().orElse(0) * 0.95;
double minHeight = rectangles.stream()
.mapToDouble(Rectangle2D::getHeight)
.min().orElse(0) * 0.95;

Set<Rectangle2D> unfilledRects = new HashSet<>();
unfilledRects.add(area);
for (Rectangle2D rectangle : rectangles) {
unfilledRects = fillWithRectangle(unfilledRects, rectangle, minWidth, minHeight);
}
return unfilledRects;
}

private Set<Rectangle2D> fillWithRectangle(Set<Rectangle2D> unfilledRects, Rectangle2D rectToAdd, double minWidth, double minHeight) {

Set<Rectangle2D> remainingUnfilledRects = new HashSet<>();
for (Rectangle2D unfilledRect : unfilledRects) {
if (!rectToAdd.intersects(unfilledRect)) {
remainingUnfilledRects.add(unfilledRect);
continue;
}

boolean topAdded = false;
boolean bottomAdded = false;

// Top rectangle
double topHeight = rectToAdd.getY() - unfilledRect.getY();
if (topHeight > minHeight) {
topAdded = true;
Rectangle2D topRect = new Rectangle2D.Double(unfilledRect.getX(), unfilledRect.getY(), unfilledRect.getWidth(), topHeight);
remainingUnfilledRects.add(topRect);
}
// Bottom rectangle
double bottomHeight = unfilledRect.getMaxY() - rectToAdd.getMaxY();
if (bottomHeight > minHeight) {
bottomAdded = true;
Rectangle2D bottomRect = new Rectangle2D.Double(unfilledRect.getX(), rectToAdd.getMaxY(), unfilledRect.getWidth(), bottomHeight);
remainingUnfilledRects.add(bottomRect);
}

double y = topAdded ? rectToAdd.getY() : unfilledRect.getY();
double maxY = bottomAdded ? rectToAdd.getMaxY() : unfilledRect.getMaxY();
double height = maxY - y;

// Left rectangle
double leftWidth = rectToAdd.getX() - unfilledRect.getX();
if (leftWidth > minWidth) {
Rectangle2D leftRect = new Rectangle2D.Double(unfilledRect.getX(), y, leftWidth, height);
remainingUnfilledRects.add(leftRect);
}
// Right rectangle
double rightWidth = unfilledRect.getMaxX() - rectToAdd.getMaxX();
if (rightWidth > minWidth) {
Rectangle2D rightRect = new Rectangle2D.Double(rectToAdd.getMaxX(), y, rightWidth, height);
remainingUnfilledRects.add(rightRect);
}
}
return remainingUnfilledRects;
}

@SneakyThrows
private static AffineTransform getInverse(AffineTransform pdfToPageTransform) {

return pdfToPageTransform.createInverse();
}

}
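A hedged usage sketch for the public findMissingRects helper above, assuming it is called from the same package: given a table area and three of its four quadrant cells, the bottom-right quadrant should come back as the only unfilled rectangle. The coordinates are invented for illustration.

import java.awt.geom.Rectangle2D;
import java.util.List;
import java.util.Set;

class TableAreaFillerExample {

    public static void main(String[] args) {
        Rectangle2D area = new Rectangle2D.Double(0, 0, 100, 100);
        List<Rectangle2D> knownCells = List.of(
                new Rectangle2D.Double(0, 0, 50, 50),   // top left
                new Rectangle2D.Double(50, 0, 50, 50),  // top right
                new Rectangle2D.Double(0, 50, 50, 50)); // bottom left
        Set<Rectangle2D> missing = TableAreaFiller.findMissingRects(knownCells, area);
        System.out.println(missing); // expected: one rectangle at x=50, y=50, w=50, h=50
    }
}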
@ -0,0 +1,272 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.LinkedQuadPointCell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.QuadPoint;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.Table;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.TableCell;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.TableCellType;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
public class TableExtractionService {
|
||||
|
||||
public static final int MAX_ROWS_OR_COLS = 500;
|
||||
public static final int MAX_CELLS = MAX_ROWS_OR_COLS * MAX_ROWS_OR_COLS;
|
||||
BlockificationService blockificationService;
|
||||
ReadingOrderService readingOrderService;
|
||||
static int MIN_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
|
||||
static double TABLE_UNIFORMITY_THRESHOLD = 0.7;
|
||||
|
||||
|
||||
public List<TablePageBlock> extractTables(List<Cell> emptyCells,
|
||||
List<Word> words,
|
||||
PageInformation pageInformation,
|
||||
List<Table> idpTables,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
|
||||
AffineTransform pdfToPageTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(pageInformation);
|
||||
List<TablePageBlock> tablePageBlocks;
|
||||
if (idpTables == null || idpTables.isEmpty()) {
|
||||
tablePageBlocks = extractTables(emptyCells, words, pdfToPageTransform, layoutParsingType);
|
||||
} else {
|
||||
tablePageBlocks = buildTableFromIdpResult(idpTables, words, pdfToPageTransform, layoutParsingType);
|
||||
}
|
||||
return tablePageBlocks;
|
||||
}
|
||||
|
||||
|
||||
private List<TablePageBlock> extractTables(List<Cell> emptyCells,
|
||||
List<Word> words,
|
||||
AffineTransform pdfToPageTransform,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
|
||||
// sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them
|
||||
emptyCells.sort(CELL_SIZE_COMPARATOR);
|
||||
|
||||
List<Cell> cells = new ArrayList<>(new HashSet<>(emptyCells));
|
||||
DoubleComparisons.sort(cells, GeometricComparators.CELL_SORTER);
|
||||
|
||||
List<Rectangle2D> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
|
||||
// sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
|
||||
// this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells
|
||||
spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);
|
||||
|
||||
List<TablePageBlock> tables = new ArrayList<>();
|
||||
for (Rectangle2D area : spreadsheetAreas) {
|
||||
|
||||
List<Cell> containedCells = new ArrayList<>();
|
||||
for (Cell cell : cells) {
|
||||
if (cell.hasMinimumSize() && area.contains(cell.getBBoxPdf())) {
|
||||
containedCells.add(cell);
|
||||
}
|
||||
}
|
||||
|
||||
if (containedCells.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
// if cells are missing, for example a corner hasn't been recognized (See files/syngenta/CustomerFiles/SinglePages/T4_Page16_138 IDD0000261736.pdf),
|
||||
// the LinkedCell based gridification can deal with this, but the transpose logic will then drop the entire column.
|
||||
// That's why we compute the missing Cells from the spreadsheet area and fill them in.
|
||||
Set<Cell> missingCells = TableAreaFiller.findMissingCells(containedCells, area, pdfToPageTransform);
|
||||
containedCells.addAll(missingCells);
|
||||
|
||||
Set<Word> wordsInTable = new HashSet<>(); // As docstrum blockfication recomputes the words, we need to remember the origin words to remove them from the overall list of words
|
||||
for (Cell cell : containedCells) {
|
||||
Function<Point2D, Boolean> contains = p -> cell.getBBoxPdf().contains(p);
|
||||
Function<Rectangle2D, Boolean> containsRect = r -> cell.getBBoxPdf().contains(r);
|
||||
BlocksWithTheirWords blocksWithTheirWords = sortBlocksIntoCell(layoutParsingType, words, tables, contains, containsRect);
|
||||
cell.setTextBlocks(blocksWithTheirWords.blocks());
|
||||
wordsInTable.addAll(blocksWithTheirWords.words());
|
||||
}
|
||||
|
||||
if (containedCells.size() > MAX_CELLS) {
|
||||
continue;
|
||||
}
|
||||
|
||||
var containedCellsWithText = containedCells.stream()
|
||||
.filter(cell -> !cell.getTextBlocks().isEmpty())
|
||||
.toList();
|
||||
|
||||
// verify if table would contain fewer cells with text than the threshold allows
|
||||
if (containedCellsWithText.size() >= MIN_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
|
||||
|
||||
TablePageBlock tablePageBlock = new TableFromCellsExtractor(containedCells, pdfToPageTransform).extract();
|
||||
cells.removeAll(containedCells);
|
||||
addTableIfValid(words, tablePageBlock, tables, wordsInTable);
|
||||
}
|
||||
}
|
||||
|
||||
return tables;
|
||||
}
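// Minimal, self-contained sketch (not part of the service code) of the ascending-area ordering used above:
// sorting candidate areas by height * width means a nested (inner) table is processed before the table that
// surrounds it, so its cells are consumed first and never duplicated into the outer table.
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

class AscendingAreaSortSketch {

    public static void main(String[] args) {

        Rectangle2D outer = new Rectangle2D.Double(0, 0, 100, 100);
        Rectangle2D inner = new Rectangle2D.Double(10, 10, 30, 30);

        List<Rectangle2D> areas = new ArrayList<>(List.of(outer, inner));
        areas.sort(Comparator.comparingDouble(r -> r.getHeight() * r.getWidth()));

        // the inner area comes first, so cells contained in it are assigned before the outer area is considered
        System.out.println(areas.get(0) == inner); // true
    }
}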
|
||||
|
||||
|
||||
private static void removeWordsFromCells(List<Word> words, TablePageBlock tablePageBlock) {
|
||||
|
||||
Set<Word> wordsFromCells = new HashSet<>(tablePageBlock.getWords());
|
||||
words.removeAll(wordsFromCells);
|
||||
}
|
||||
|
||||
|
||||
private List<TablePageBlock> buildTableFromIdpResult(List<Table> idpTables, List<Word> words, AffineTransform pdfToPageTransform, LayoutParsingType layoutParsingType) {
|
||||
|
||||
if (idpTables == null || idpTables.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
List<TablePageBlock> tables = new ArrayList<>();
|
||||
for (Table idpTable : idpTables) {
|
||||
if (idpTable.bboxes().size() != 1) {
|
||||
// Should never happen, as IDP still seems to process pages individually.
log.error("IDP tables spanning multiple pages are not handled yet!");
|
||||
continue;
|
||||
}
|
||||
|
||||
List<LinkedQuadPointCell> qpCells = new ArrayList<>(idpTable.cells().size());
|
||||
Set<Word> wordsInTable = new HashSet<>(); // As docstrum blockification recomputes the words, we need to remember the original words to remove them from the overall list of words
|
||||
for (TableCell idpCell : idpTable.cells()) {
|
||||
BlocksWithTheirWords blocksWithTheirWords = sortWordsIntoQuadPoint(words, layoutParsingType, idpCell, tables);
|
||||
wordsInTable.addAll(blocksWithTheirWords.words());
|
||||
|
||||
LinkedQuadPointCell qpCell = new LinkedQuadPointCell(QuadPoint.fromData(idpCell.textRegion().region().bbox()).getTransformed(pdfToPageTransform),
|
||||
blocksWithTheirWords.blocks);
|
||||
if (idpCell.kind().equals(TableCellType.ROW_HEADER) || idpCell.kind().equals(TableCellType.COLUMN_HEADER)) {
|
||||
qpCell.setHeaderCell(true);
|
||||
}
|
||||
|
||||
qpCells.add(qpCell);
|
||||
}
|
||||
|
||||
QuadPointGridifier calculator = new QuadPointGridifier(qpCells, pdfToPageTransform);
|
||||
List<List<Cell>> rows = calculator.gridify();
|
||||
TablePageBlock tablePageBlock = new TablePageBlock(null, rows);
|
||||
addTableIfValid(words, tablePageBlock, tables, wordsInTable);
|
||||
}
|
||||
return tables;
|
||||
}
|
||||
|
||||
|
||||
private BlocksWithTheirWords sortWordsIntoQuadPoint(List<Word> words, LayoutParsingType layoutParsingType, TableCell idpCell, List<TablePageBlock> tables) {
|
||||
|
||||
Function<Point2D, Boolean> contains = p -> idpCell.textRegion().region().bbox().get().contains(p);
|
||||
Function<Rectangle2D, Boolean> containsRect = r -> idpCell.textRegion().region().bbox().get().contains(r);
|
||||
return sortBlocksIntoCell(layoutParsingType, words, tables, contains, containsRect);
|
||||
}
|
||||
|
||||
|
||||
private static void addTableIfValid(List<Word> words, TablePageBlock tablePageBlock, List<TablePageBlock> tables, Set<Word> wordsInTable) {
|
||||
|
||||
if (tablePageBlock.getRowCount() > MAX_ROWS_OR_COLS || tablePageBlock.getColCount() == 0 || tablePageBlock.getColCount() > MAX_ROWS_OR_COLS) {
|
||||
return;
|
||||
}
|
||||
words.removeAll(wordsInTable);
|
||||
tables.add(tablePageBlock);
|
||||
}
|
||||
|
||||
|
||||
private BlocksWithTheirWords sortBlocksIntoCell(LayoutParsingType layoutParsingType,
|
||||
List<Word> words,
|
||||
List<TablePageBlock> tables,
|
||||
Function<Point2D, Boolean> contains,
|
||||
Function<Rectangle2D, Boolean> containsRect) {
|
||||
|
||||
List<Word> wordsInCell = new LinkedList<>();
|
||||
for (Word word : words) {
|
||||
Rectangle2D bBoxPdf = word.getBBoxPdf();
|
||||
if (!contains.apply(new Point2D.Double(bBoxPdf.getCenterX(), bBoxPdf.getCenterY()))) {
|
||||
continue;
|
||||
}
|
||||
wordsInCell.add(word);
|
||||
}
|
||||
List<TextPageBlock> textBlocks = blockificationService.blockify(layoutParsingType, wordsInCell, CleanRulings.empty(), null);
|
||||
List<TablePageBlock> tablesInCell = new LinkedList<>();
|
||||
for (TablePageBlock table : tables) {
|
||||
if (containsRect.apply(table.getBBoxPdf())) {
|
||||
tablesInCell.add(table);
|
||||
}
|
||||
}
|
||||
var blocks = readingOrderService.resolve(textBlocks, tablesInCell);
|
||||
return new BlocksWithTheirWords(blocks, wordsInCell);
|
||||
}
|
||||
|
||||
|
||||
private boolean checkIfTableCellsAreUniform(List<Cell> containedCells) {
|
||||
|
||||
if (containedCells.size() <= 2) {
|
||||
return true;
|
||||
}
|
||||
|
||||
Map<Long, List<Long>> cellsGroupedByRoundedWidth = containedCells.stream()
|
||||
.map(BoundingBox::getWidth)
|
||||
.map(size -> Math.round(size / 10.0) * 10)
|
||||
.collect(Collectors.groupingBy(Long::longValue));
|
||||
|
||||
return (double) cellsGroupedByRoundedWidth.size() / containedCells.size() <= TABLE_UNIFORMITY_THRESHOLD;
|
||||
}
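// Self-contained sketch of the uniformity heuristic above (the threshold value here is assumed; the real
// TABLE_UNIFORMITY_THRESHOLD is defined elsewhere in this class): widths are rounded to the nearest 10 and the
// table counts as uniform when the number of distinct rounded widths is small relative to the number of cells.
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

class UniformityCheckSketch {

    private static final double ASSUMED_UNIFORMITY_THRESHOLD = 0.5; // hypothetical value for illustration

    static boolean isUniform(List<Double> cellWidths) {

        if (cellWidths.size() <= 2) {
            return true;
        }
        Set<Long> roundedWidths = cellWidths.stream()
                .map(width -> Math.round(width / 10.0) * 10)
                .collect(Collectors.toSet());
        return (double) roundedWidths.size() / cellWidths.size() <= ASSUMED_UNIFORMITY_THRESHOLD;
    }

    public static void main(String[] args) {
        // 6 cells, only two distinct rounded widths (50 and 80) -> 2/6 <= 0.5 -> uniform
        System.out.println(isUniform(List.of(49.0, 51.0, 50.5, 80.0, 79.0, 81.0)));
        // 4 cells with 4 distinct rounded widths -> 4/4 > 0.5 -> not uniform
        System.out.println(isUniform(List.of(10.0, 30.0, 50.0, 70.0)));
    }
}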
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, PageInformation pageInformation) {
|
||||
|
||||
var solidHorizontalRulingLines = horizontalRulingLines.stream()
|
||||
.filter(r -> !Objects.equals(Ruling.Style.DASHED, r.getStyle()))
|
||||
.toList();
|
||||
var solidVerticalRulingLines = verticalRulingLines.stream()
|
||||
.filter(r -> !Objects.equals(Ruling.Style.DASHED, r.getStyle()))
|
||||
.toList();
|
||||
AffineTransform affineTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(pageInformation);
|
||||
return RectangularIntersectionFinder.find(solidHorizontalRulingLines, solidVerticalRulingLines)
|
||||
.stream()
|
||||
.map(rect -> new Cell(rect, affineTransform))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
private record BlocksWithTheirWords(List<AbstractPageBlock> blocks, Collection<Word> words) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,133 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class TableFromCellsExtractor {
|
||||
|
||||
@JsonIgnore
|
||||
protected PageBlockType classification;
|
||||
private List<List<Cell>> rows;
|
||||
@Getter
|
||||
@Setter
|
||||
private final List<Cell> originCells;
|
||||
private final AffineTransform pdfToPageTransform;
|
||||
|
||||
|
||||
public TableFromCellsExtractor(List<Cell> originCells, AffineTransform pdfToPageTransform) {
|
||||
|
||||
classification = PageBlockType.TABLE;
|
||||
this.originCells = originCells;
|
||||
this.pdfToPageTransform = pdfToPageTransform;
|
||||
}
|
||||
|
||||
|
||||
public TablePageBlock extract() {
|
||||
|
||||
computeRows(originCells);
|
||||
|
||||
computeHeaders();
|
||||
|
||||
return new TablePageBlock(null, rows);
|
||||
}
|
||||
|
||||
|
||||
/**
* Detects header cells (either the first row or the first column):
* a column is marked as header if the column cell text is bold and the row cell text is not bold.
* Defaults to row headers.
*/
|
||||
private void computeHeaders() {
|
||||
|
||||
// A bold cell is a header cell as long as every cell to its left/top is bold, too.
// We move from left to right and top to bottom.
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
List<Cell> rowCells = rows.get(rowIndex);
|
||||
if (rowCells.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
|
||||
Cell cell = rowCells.get(colIndex);
|
||||
List<Cell> cellsToTheLeft = rowCells.subList(0, colIndex);
|
||||
Cell lastHeaderCell = null;
|
||||
for (Cell leftCell : cellsToTheLeft) {
|
||||
if (leftCell.isHeaderCell()) {
|
||||
lastHeaderCell = leftCell;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
List<Cell> cellsToTheTop = new ArrayList<>();
|
||||
for (int i = 0; i < rowIndex; i++) {
|
||||
try {
|
||||
cellsToTheTop.add(rows.get(i).get(colIndex));
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
|
||||
}
|
||||
}
|
||||
for (Cell topCell : cellsToTheTop) {
|
||||
if (topCell.isHeaderCell()) {
|
||||
lastHeaderCell = topCell;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
if (!cell.getTextBlocks().isEmpty() //
|
||||
&& cell.getTextBlocks().get(0) instanceof TextPageBlock textPageBlock //
|
||||
&& textPageBlock.getMostPopularWordStyle().equals("bold")) {
|
||||
cell.setHeaderCell(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
setFirstRowAsHeaderIfNoneFound(rows);
|
||||
}
|
||||
|
||||
|
||||
private void setFirstRowAsHeaderIfNoneFound(List<List<Cell>> rows) {
|
||||
|
||||
if (rows.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (rows.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.noneMatch(Cell::isHeaderCell)) {
|
||||
rows.get(0)
|
||||
.forEach(cell -> cell.setHeaderCell(true));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void computeRows(List<Cell> cells) {
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
TableGridStructureCalculator calculator = new TableGridStructureCalculator(cells, pdfToPageTransform);
|
||||
rows = calculator.gridify();
|
||||
}
|
||||
|
||||
}
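// Standalone sketch (simplified stand-in types, not the production Cell model) of the header rule implemented in
// computeHeaders/setFirstRowAsHeaderIfNoneFound above: a cell whose dominant word style is bold becomes a header
// cell, and if no cell qualifies, the whole first row is used as the header.
import java.util.List;

class HeaderDetectionSketch {

    static class SimpleCell {
        final String dominantStyle;
        boolean header;

        SimpleCell(String dominantStyle) {
            this.dominantStyle = dominantStyle;
        }
    }

    static void detectHeaders(List<List<SimpleCell>> rows) {

        rows.stream()
                .flatMap(List::stream)
                .filter(cell -> "bold".equals(cell.dominantStyle))
                .forEach(cell -> cell.header = true);

        boolean anyHeader = rows.stream()
                .flatMap(List::stream)
                .anyMatch(cell -> cell.header);
        if (!anyHeader && !rows.isEmpty()) {
            rows.get(0).forEach(cell -> cell.header = true);
        }
    }

    public static void main(String[] args) {
        List<List<SimpleCell>> rows = List.of(
                List.of(new SimpleCell("bold"), new SimpleCell("bold")),
                List.of(new SimpleCell("regular"), new SimpleCell("regular")));
        detectHeaders(rows);
        System.out.println(rows.get(0).get(0).header); // true
        System.out.println(rows.get(1).get(0).header); // false
    }
}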
|
||||
@ -0,0 +1,363 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class TableGridStructureCalculator {
|
||||
|
||||
// Multiplied with the minimum cell height/width: cells may be at most this far apart in one dimension, and must overlap by at least that much in the other dimension, to be considered neighbours.
|
||||
private static final double DISTANCE_FACTOR = 0.5;
|
||||
private static final int MAX_SPLITTING_ITERATIONS = 10;
|
||||
Set<Cell> cells;
|
||||
AffineTransform pageToPdfTransform;
|
||||
double minCellHeight;
|
||||
double minCellWidth;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
TableGridStructureCalculator(Collection<Cell> cells, AffineTransform pdfToPageTransform) {
|
||||
|
||||
this.cells = new HashSet<>(cells);
|
||||
this.pageToPdfTransform = pdfToPageTransform.createInverse();
|
||||
this.minCellHeight = cells.stream()
|
||||
.mapToDouble(cell -> cell.getBBox().getHeight())
|
||||
.min().orElse(0);
|
||||
this.minCellWidth = cells.stream()
|
||||
.mapToDouble(cell -> cell.getBBox().getWidth())
|
||||
.min().orElse(0);
|
||||
}
|
||||
|
||||
|
||||
/**
* Calculates the grid structure of the table. For spanning rows and columns, multiple cells with the same values are inserted.
* Checks whether any cell has more than one neighbour in any direction; if it does, the cell is split according to its neighbours.
* This is repeated until no more splits are necessary. Then the rows are computed from that same linked-neighbour structure, starting with the top-left cell.
*
* @return the table structure as a matrix of rows of cells
*/
|
||||
public List<List<Cell>> gridify() {
|
||||
|
||||
if (cellsHaveLargeOverlaps()) {
|
||||
// If cells overlap significantly, the logic below will keep splitting them infinitely, so we revert to the simpler area sweep implementation.
|
||||
return areaSweepFallback();
|
||||
}
|
||||
|
||||
var linkedCells = cells.stream()
|
||||
.map(LinkedCell::new)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
computeNeighbours(linkedCells);
|
||||
int splits = 0;
|
||||
while (linkedCells.stream()
|
||||
.anyMatch(LinkedCell::needsSplit) && splits <= MAX_SPLITTING_ITERATIONS) {
|
||||
|
||||
List<LinkedCell> newCells = new LinkedList<>();
|
||||
for (LinkedCell linkedCell : linkedCells) {
|
||||
if (linkedCell.needsSplit()) {
|
||||
newCells.addAll(linkedCell.split());
|
||||
} else {
|
||||
newCells.add(linkedCell);
|
||||
}
|
||||
}
|
||||
computeNeighbours(newCells);
|
||||
linkedCells = newCells;
|
||||
splits++;
|
||||
}
|
||||
return buildStructure(linkedCells);
|
||||
}
|
||||
|
||||
|
||||
private List<List<Cell>> areaSweepFallback() {
|
||||
|
||||
List<List<Cell>> rows = AreaSweepGridifier.gridify(cells, pageToPdfTransform, minCellWidth, minCellHeight);
|
||||
rows = removeEmptyRows(rows);
|
||||
rows = removeEmptyCols(rows);
|
||||
return rows;
|
||||
}
|
||||
|
||||
|
||||
private boolean cellsHaveLargeOverlaps() {
|
||||
|
||||
for (Cell cell1 : cells) {
|
||||
for (Cell cell2 : cells) {
|
||||
if (cell1.equals(cell2)) {
|
||||
continue;
|
||||
}
|
||||
if (cell1.horizontalOverlap(cell2) > minCellWidth * DISTANCE_FACTOR //
|
||||
&& cell1.verticalOverlap(cell2) > minCellHeight * DISTANCE_FACTOR) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private List<List<Cell>> buildStructure(List<LinkedCell> cells) {
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
List<List<Cell>> rows = buildRows(cells);
|
||||
if (isNotRectangular(rows)) {
|
||||
// For some tables the result is not rectangular; this happens either when cells are missing or when the algorithm would need more than the maximum number of iterations to resolve it.
// This is unacceptable, so we revert to the area sweep implementation, which by design always produces a rectangular result.
|
||||
return areaSweepFallback();
|
||||
}
|
||||
rows = removeEmptyRows(rows);
|
||||
rows = removeEmptyCols(rows);
|
||||
return rows;
|
||||
}
|
||||
|
||||
|
||||
private boolean isNotRectangular(List<List<Cell>> rows) {
|
||||
|
||||
if (rows.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
int n = rows.get(0).size();
|
||||
return rows.stream()
|
||||
.anyMatch(row -> row.size() != n);
|
||||
}
|
||||
|
||||
|
||||
private List<List<Cell>> buildRows(List<LinkedCell> cells) {
|
||||
|
||||
List<LinkedCell> topLeftCandidates = cells.stream()
|
||||
.filter(LinkedCell::isTopLeft)
|
||||
.toList();
|
||||
|
||||
assert topLeftCandidates.size() == 1;
|
||||
var cell = topLeftCandidates.get(0);
|
||||
|
||||
List<List<Cell>> rows = new ArrayList<>();
|
||||
rows.add(buildRow(cell));
|
||||
while (!cell.belows.isEmpty()) {
|
||||
cell = cell.belows.get(0);
|
||||
rows.add(buildRow(cell));
|
||||
}
|
||||
if (isNotRectangular(rows)) {
|
||||
throw new AssertionError();
|
||||
}
|
||||
return rows;
|
||||
}
|
||||
|
||||
|
||||
private static List<Cell> buildRow(LinkedCell cell) {
|
||||
|
||||
List<Cell> currentRow = new ArrayList<>();
|
||||
LinkedCell nextCell = cell;
|
||||
currentRow.add(cell.originalCell);
|
||||
while (!nextCell.rights.isEmpty()) {
|
||||
nextCell = nextCell.rights.get(0);
|
||||
currentRow.add(nextCell.originalCell);
|
||||
}
|
||||
return currentRow;
|
||||
}
|
||||
|
||||
|
||||
private void computeNeighbours(List<LinkedCell> cells) {
|
||||
|
||||
for (LinkedCell cell : cells) {
|
||||
cell.resetNeighbours();
|
||||
computeNeighbours(cell, cells);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void computeNeighbours(LinkedCell cell, List<LinkedCell> otherCells) {
|
||||
|
||||
for (LinkedCell otherCell : otherCells) {
|
||||
if (cell.equals(otherCell)) {
|
||||
continue;
|
||||
}
|
||||
if (cell.originalCell.horizontalDistance(otherCell.originalCell) <= minCellWidth * DISTANCE_FACTOR
|
||||
&& cell.originalCell.verticalOverlap(otherCell.originalCell) >= minCellHeight * DISTANCE_FACTOR) {
|
||||
if (cell.originalCell.getBBox().getCenterX() <= otherCell.originalCell.getBBox().getCenterX()) {
|
||||
cell.rights.add(otherCell);
|
||||
} else {
|
||||
cell.lefts.add(otherCell);
|
||||
}
|
||||
} else if (cell.originalCell.verticalDistance(otherCell.originalCell) <= minCellHeight * DISTANCE_FACTOR
|
||||
&& cell.originalCell.horizontalOverlap(otherCell.originalCell) >= minCellWidth * DISTANCE_FACTOR) {
|
||||
if (cell.originalCell.getBBox().getCenterY() <= otherCell.originalCell.getBBox().getCenterY()) {
|
||||
cell.belows.add(otherCell);
|
||||
} else {
|
||||
cell.aboves.add(otherCell);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
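// Self-contained sketch of the neighbour test above, with assumed semantics for Cell.horizontalDistance and
// Cell.verticalOverlap (gap between the x-ranges and length of the shared y-range, respectively): two boxes are
// horizontal neighbours when the gap in x is small and the overlap in y is large, both relative to the minimum
// cell dimensions scaled by DISTANCE_FACTOR.
import java.awt.geom.Rectangle2D;

class NeighbourCriterionSketch {

    private static final double DISTANCE_FACTOR = 0.5;

    static double horizontalGap(Rectangle2D a, Rectangle2D b) {
        return Math.max(0, Math.max(a.getMinX(), b.getMinX()) - Math.min(a.getMaxX(), b.getMaxX()));
    }

    static double verticalOverlap(Rectangle2D a, Rectangle2D b) {
        return Math.max(0, Math.min(a.getMaxY(), b.getMaxY()) - Math.max(a.getMinY(), b.getMinY()));
    }

    static boolean horizontalNeighbours(Rectangle2D a, Rectangle2D b, double minCellWidth, double minCellHeight) {
        return horizontalGap(a, b) <= minCellWidth * DISTANCE_FACTOR
                && verticalOverlap(a, b) >= minCellHeight * DISTANCE_FACTOR;
    }

    public static void main(String[] args) {
        Rectangle2D left = new Rectangle2D.Double(0, 0, 40, 20);
        Rectangle2D right = new Rectangle2D.Double(42, 0, 40, 20);   // 2pt gap, full vertical overlap
        Rectangle2D below = new Rectangle2D.Double(0, 25, 40, 20);   // no vertical overlap with 'left'
        System.out.println(horizontalNeighbours(left, right, 40, 20)); // true  (gap 2 <= 20, overlap 20 >= 10)
        System.out.println(horizontalNeighbours(left, below, 40, 20)); // false (no vertical overlap)
    }
}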
|
||||
|
||||
|
||||
static <T> List<List<T>> transpose(List<List<T>> table) {
|
||||
|
||||
List<List<T>> ret = new ArrayList<>();
final int N = table.get(0).size();
for (int i = 0; i < N; i++) {
List<T> col = new ArrayList<>();
|
||||
for (List<T> row : table) {
|
||||
col.add(row.get(i));
|
||||
}
|
||||
ret.add(col);
|
||||
}
|
||||
return ret;
|
||||
}
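// Quick usage sketch of the transpose helper above (the helper is reimplemented here so the snippet stands alone):
// transposing a 2x3 matrix of rows yields a 3x2 matrix of columns, which is what removeEmptyCols relies on to
// reuse removeEmptyRows for columns.
import java.util.ArrayList;
import java.util.List;

class TransposeSketch {

    static <T> List<List<T>> transpose(List<List<T>> table) {
        List<List<T>> columns = new ArrayList<>();
        for (int i = 0; i < table.get(0).size(); i++) {
            List<T> column = new ArrayList<>();
            for (List<T> row : table) {
                column.add(row.get(i));
            }
            columns.add(column);
        }
        return columns;
    }

    public static void main(String[] args) {
        List<List<Integer>> rows = List.of(List.of(1, 2, 3), List.of(4, 5, 6));
        System.out.println(transpose(rows)); // [[1, 4], [2, 5], [3, 6]]
    }
}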
|
||||
|
||||
|
||||
private List<List<Cell>> removeEmptyCols(List<List<Cell>> rowsOfCells) {
|
||||
|
||||
if (rowsOfCells.isEmpty()) {
|
||||
return rowsOfCells;
|
||||
}
|
||||
|
||||
var colsOfCells = transpose(rowsOfCells);
|
||||
colsOfCells = removeEmptyRows(colsOfCells);
|
||||
return transpose(colsOfCells);
|
||||
}
|
||||
|
||||
|
||||
private List<List<Cell>> removeEmptyRows(List<List<Cell>> rowsOfCells) {
|
||||
|
||||
return rowsOfCells.stream()
|
||||
.filter(row -> row.stream()
|
||||
.anyMatch(cell -> !cell.getTextBlocks().isEmpty()))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
class LinkedCell {
|
||||
|
||||
private final Cell originalCell;
|
||||
private final List<LinkedCell> rights;
|
||||
private final List<LinkedCell> lefts;
|
||||
private final List<LinkedCell> aboves;
|
||||
private final List<LinkedCell> belows;
|
||||
|
||||
|
||||
LinkedCell(Cell cell) {
|
||||
|
||||
this.originalCell = cell;
|
||||
this.rights = new LinkedList<>();
|
||||
this.lefts = new LinkedList<>();
|
||||
this.aboves = new LinkedList<>();
|
||||
this.belows = new LinkedList<>();
|
||||
}
|
||||
|
||||
|
||||
public boolean needsSplit() {
|
||||
|
||||
return rights.size() > 1 || lefts.size() > 1 || aboves.size() > 1 || belows.size() > 1;
|
||||
}
|
||||
|
||||
|
||||
public boolean isTopLeft() {
|
||||
|
||||
return lefts.isEmpty() && aboves.isEmpty();
|
||||
}
|
||||
|
||||
|
||||
public String toString() {
|
||||
|
||||
return originalCell.toString();
|
||||
}
|
||||
|
||||
|
||||
public Collection<LinkedCell> split() {
|
||||
|
||||
if (rights.size() > 1 && rights.size() >= lefts.size()) {
|
||||
return splitY(rights);
|
||||
}
|
||||
if (lefts.size() > 1) {
|
||||
return splitY(lefts);
|
||||
}
|
||||
if (aboves.size() > 1 && aboves.size() >= belows.size()) {
|
||||
return splitX(aboves);
|
||||
}
|
||||
if (belows.size() > 1) {
|
||||
return splitX(belows);
|
||||
}
|
||||
return List.of(this);
|
||||
}
|
||||
|
||||
|
||||
private List<LinkedCell> splitY(List<LinkedCell> neighbours) {
|
||||
|
||||
List<LinkedCell> splitCells = new LinkedList<>();
|
||||
List<Double> ySplit = neighbours.stream()
|
||||
.map(right -> right.originalCell.getMaxY())
|
||||
.sorted()
|
||||
.toList();
|
||||
Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
|
||||
double maxX = originalCell.getBBox().getMaxX();
|
||||
double x = originalCell.getBBox().getX();
|
||||
double maxY = originalCell.getBBox().getMaxY();
|
||||
for (Double neighborY : ySplit) {
|
||||
double y = Math.min(neighborY, maxY);
|
||||
Point2D bottomRight = new Point2D.Double(maxX, y);
|
||||
Cell cell = copyCell(topLeft, bottomRight);
|
||||
splitCells.add(new LinkedCell(cell));
|
||||
topLeft = new Point2D.Double(x, y);
|
||||
}
|
||||
return splitCells;
|
||||
}
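// Standalone, simplified sketch of the splitY idea above (plain rectangles instead of LinkedCell): a tall cell
// that faces several right-hand neighbours is cut at those neighbours' bottom edges (clamped to the cell itself),
// producing one sub-cell per neighbour row.
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;

class SplitAtNeighbourEdgesSketch {

    static List<Rectangle2D> splitAtY(Rectangle2D cell, List<Double> neighbourMaxYs) {
        List<Rectangle2D> parts = new ArrayList<>();
        double top = cell.getMinY();
        for (double cut : neighbourMaxYs.stream().sorted().toList()) {
            double bottom = Math.min(cut, cell.getMaxY());
            parts.add(new Rectangle2D.Double(cell.getMinX(), top, cell.getWidth(), bottom - top));
            top = bottom;
        }
        return parts;
    }

    public static void main(String[] args) {
        Rectangle2D spanningCell = new Rectangle2D.Double(0, 0, 20, 30);
        // two neighbours to the right ending at y = 10 and y = 30 -> the spanning cell becomes two stacked cells
        System.out.println(splitAtY(spanningCell, List.of(10.0, 30.0)));
    }
}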
|
||||
|
||||
|
||||
private List<LinkedCell> splitX(List<LinkedCell> neighbours) {
|
||||
|
||||
List<LinkedCell> splitCells = new LinkedList<>();
|
||||
List<Double> xSplit = neighbours.stream()
|
||||
.map(right -> right.originalCell.getMaxX())
|
||||
.sorted()
|
||||
.toList();
|
||||
Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
|
||||
double maxY = originalCell.getBBox().getMaxY();
|
||||
double y = originalCell.getBBox().getY();
|
||||
double maxX = originalCell.getBBox().getMaxX();
|
||||
for (Double neighborX : xSplit) {
|
||||
double x = Math.min(neighborX, maxX);
|
||||
Point2D bottomRight = new Point2D.Double(x, maxY);
|
||||
Cell cell = copyCell(topLeft, bottomRight);
|
||||
splitCells.add(new LinkedCell(cell));
|
||||
topLeft = new Point2D.Double(x, y);
|
||||
}
|
||||
return splitCells;
|
||||
}
|
||||
|
||||
|
||||
private Cell copyCell(Point2D topLeft, Point2D bottomRight) {
|
||||
|
||||
Cell cell = Cell.fromPageCoordinates(topLeft, bottomRight, pageToPdfTransform);
|
||||
cell.setHeaderCell(originalCell.isHeaderCell());
|
||||
cell.setTextBlocks(originalCell.getTextBlocks());
|
||||
return cell;
|
||||
}
|
||||
|
||||
|
||||
public void resetNeighbours() {
|
||||
|
||||
rights.clear();
|
||||
lefts.clear();
|
||||
aboves.clear();
|
||||
belows.clear();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
@ -0,0 +1,113 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.util.Objects;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.Figure;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.KeyValuePair;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.Region;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.Table;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.TableCell;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.TableCellType;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.TextRegion;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.IdpLayerConfig;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
public class IdpResultLayer extends IdpLayerConfig {
|
||||
|
||||
public static final int LINE_WIDTH = 1;
|
||||
|
||||
|
||||
public IdpResultLayer(IdpResult result) {
|
||||
|
||||
result.tables()
|
||||
.forEach(this::addTable);
|
||||
result.keyValuePairs()
|
||||
.forEach(this::addKeyValue);
|
||||
result.figures()
|
||||
.forEach(this::addFigure);
|
||||
}
|
||||
|
||||
|
||||
private void addFigure(Figure figure) {
|
||||
|
||||
addRegion(figure.image(), figures, IMAGE_COLOR);
|
||||
if (figure.caption() != null) {
|
||||
addRegion(figure.caption().region(), figures, IMAGE_COLOR);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addTable(Table table) {
|
||||
|
||||
for (Region bbox : table.bboxes()) {
|
||||
addRegion(bbox, tables, TABLE_COLOR);
|
||||
}
|
||||
for (TableCell cell : table.cells()) {
|
||||
addRegion(cell.textRegion().region(), tables, INNER_LINES_COLOR);
|
||||
if (Objects.equals(cell.kind(), TableCellType.ROW_HEADER) || Objects.equals(cell.kind(), TableCellType.COLUMN_HEADER)) {
|
||||
addRegionAsFilledRect(cell.textRegion().region(), tables, HEADER_CELL_COLOR);
|
||||
}
|
||||
}
|
||||
if (table.caption() != null) {
|
||||
addRegion(table.caption().region(), tables, TABLE_COLOR);
|
||||
}
|
||||
for (TextRegion footnote : table.footnotes()) {
|
||||
addRegion(footnote.region(), tables, FOOTNOTE_COLOR);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addQuadPoint(int pageNumber, QuadPoint bbox, Visualizations vis, Color color) {
|
||||
|
||||
var visOnPage = getOrCreateVisualizationsOnPage(pageNumber, vis);
|
||||
bbox.asLines()
|
||||
.forEach(line -> visOnPage.getColoredLines().add(new ColoredLine(line, color, LINE_WIDTH)));
|
||||
}
|
||||
|
||||
|
||||
private void addRegion(Region region, Visualizations vis, Color color) {
|
||||
|
||||
var sectionsOnPage = getOrCreateVisualizationsOnPage(region.pageNumber(), vis);
|
||||
region.bbox().get().asLines()
|
||||
.forEach(line -> sectionsOnPage.getColoredLines().add(new ColoredLine(line, color, LINE_WIDTH)));
|
||||
}
|
||||
|
||||
|
||||
private void addRegionAsFilledRect(Region region, Visualizations vis, Color color) {
|
||||
|
||||
var sectionsOnPage = getOrCreateVisualizationsOnPage(region.pageNumber(), vis);
|
||||
sectionsOnPage.getFilledRectangles().add(new FilledRectangle(region.bbox().get().getBounds2D(), color, 0.2f));
|
||||
}
|
||||
|
||||
|
||||
public void addKeyValue(KeyValuePair keyValue) {
|
||||
|
||||
if (keyValue.key() != null) {
|
||||
addRegion(keyValue.key().region(), keyValuePairs, KEY_COLOR);
|
||||
}
|
||||
if (keyValue.value() != null) {
|
||||
addRegion(keyValue.value().region(), keyValuePairs, VALUE_COLOR);
|
||||
}
|
||||
if (keyValue.key() != null && keyValue.value() != null) {
|
||||
QuadPoint key = keyValue.key().region().bbox().get();
|
||||
QuadPoint value = keyValue.value().region().bbox().get();
|
||||
|
||||
var line = LineUtils.findClosestMidpointLine(key, value);
|
||||
var arrowHead = LineUtils.createArrowHead(line, Math.min(LineUtils.length(line), 5));
|
||||
var linesOnPage = getOrCreateVisualizationsOnPage(keyValue.key().region().pageNumber(), keyValuePairs).getColoredLines();
|
||||
linesOnPage.add(new ColoredLine(line, KEY_VALUE_BBOX_COLOR, LINE_WIDTH));
|
||||
linesOnPage.add(new ColoredLine(arrowHead[0], KEY_VALUE_BBOX_COLOR, LINE_WIDTH));
|
||||
linesOnPage.add(new ColoredLine(arrowHead[1], KEY_VALUE_BBOX_COLOR, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
@ -14,6 +15,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.OutlineMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutGrid;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Outline;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||
|
||||
@ -48,16 +50,12 @@ public class LayoutGridService {
|
||||
document.layoutDebugLayer().addSentenceVisualization(document.document().getTextBlock());
|
||||
document.layoutDebugLayer().addOutlineHeadlines(document.document());
|
||||
|
||||
List<LayerGroup> layers = new LinkedList<>();
|
||||
layers.add(layoutGrid);
|
||||
if (document.layoutDebugLayer().isActive()) {
|
||||
viewerDocumentService.addLayerGroups(originFile,
|
||||
destinationFile,
|
||||
List.of(layoutGrid, document.layoutDebugLayer()),
|
||||
layoutParserVersion,
|
||||
layoutParsingTypeName,
|
||||
outline);
|
||||
} else {
|
||||
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid), layoutParserVersion, layoutParsingTypeName, outline);
|
||||
layers.add(document.layoutDebugLayer());
|
||||
}
|
||||
viewerDocumentService.addLayerGroups(originFile, destinationFile, layers, layoutParserVersion, layoutParsingTypeName, outline);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,125 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class LineUtils {
|
||||
|
||||
public List<ColoredLine> quadPointAsLines(QuadPoint rect, boolean tight) {
|
||||
|
||||
if (tight) {
|
||||
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.GREEN, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.GREEN, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.GREEN, 1));
|
||||
}
|
||||
|
||||
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.BLUE, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.BLUE, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.BLUE, 1));
|
||||
}
|
||||
|
||||
|
||||
public List<ColoredLine> quadPointAsLines(QuadPoint rect, Color color) {
|
||||
|
||||
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), color, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), color, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), color, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), color, 1));
|
||||
}
|
||||
|
||||
|
||||
public static Line2D transform(Line2D line2D, AffineTransform affineTransform) {
|
||||
|
||||
var p1 = affineTransform.transform(line2D.getP1(), null);
|
||||
var p2 = affineTransform.transform(line2D.getP2(), null);
|
||||
return new Line2D.Double(p1, p2);
|
||||
}
|
||||
|
||||
|
||||
public static double length(Line2D line2D) {
|
||||
|
||||
return line2D.getP1().distance(line2D.getP2());
|
||||
}
|
||||
|
||||
|
||||
public static Line2D findClosestMidpointLine(QuadPoint quad1, QuadPoint quad2) {
|
||||
|
||||
List<Line2D> lines1 = quad1.asLines()
|
||||
.toList();
|
||||
List<Line2D> lines2 = quad2.asLines()
|
||||
.toList();
|
||||
|
||||
Line2D closestLine1 = null;
|
||||
Line2D closestLine2 = null;
|
||||
double minDistance = Double.MAX_VALUE;
|
||||
|
||||
for (Line2D line1 : lines1) {
|
||||
for (Line2D line2 : lines2) {
|
||||
double distance = lineDistance(line1, line2);
|
||||
if (distance < minDistance) {
|
||||
minDistance = distance;
|
||||
closestLine1 = line1;
|
||||
closestLine2 = line2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (closestLine1 == null || closestLine2 == null) {
|
||||
throw new IllegalStateException("Could not find closest lines");
|
||||
}
|
||||
|
||||
Point2D midpoint1 = getMidpoint(closestLine1);
|
||||
Point2D midpoint2 = getMidpoint(closestLine2);
|
||||
|
||||
return new Line2D.Double(midpoint1, midpoint2);
|
||||
}
|
||||
|
||||
|
||||
private static double lineDistance(Line2D line1, Line2D line2) {
|
||||
|
||||
return getMidpoint(line1).distance(getMidpoint(line2));
|
||||
}
|
||||
|
||||
|
||||
private static Point2D getMidpoint(Line2D line) {
|
||||
|
||||
double x = (line.getX1() + line.getX2()) / 2;
|
||||
double y = (line.getY1() + line.getY2()) / 2;
|
||||
return new Point2D.Double(x, y);
|
||||
}
|
||||
|
||||
|
||||
public static Line2D[] createArrowHead(Line2D line, double arrowLength) {
|
||||
|
||||
Point2D start = line.getP1();
|
||||
Point2D end = line.getP2();
|
||||
|
||||
// Calculate the angle of the line
|
||||
double angle = Math.atan2(end.getY() - start.getY(), end.getX() - start.getX());
|
||||
|
||||
// Calculate the points for the two arrow lines
|
||||
double arrowHeadAngle = Math.PI / 6;
|
||||
double x1 = end.getX() - arrowLength * Math.cos(angle - arrowHeadAngle);
|
||||
double y1 = end.getY() - arrowLength * Math.sin(angle - arrowHeadAngle);
|
||||
double x2 = end.getX() - arrowLength * Math.cos(angle + arrowHeadAngle);
|
||||
double y2 = end.getY() - arrowLength * Math.sin(angle + arrowHeadAngle);
|
||||
|
||||
// Create and return the two arrow lines
|
||||
Line2D arrow1 = new Line2D.Double(end, new Point2D.Double(x1, y1));
|
||||
Line2D arrow2 = new Line2D.Double(end, new Point2D.Double(x2, y2));
|
||||
|
||||
return new Line2D[]{arrow1, arrow2};
|
||||
}
|
||||
|
||||
}
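// Standalone sketch of the key->value arrow construction used in IdpResultLayer.addKeyValue, using only standard
// java.awt.geom types: connect the midpoints of the two closest edges, then attach two short arrow-head lines
// rotated +/-30 degrees away from the connecting line (mirroring findClosestMidpointLine and createArrowHead).
import java.awt.geom.Line2D;
import java.awt.geom.Point2D;

class KeyValueArrowSketch {

    public static void main(String[] args) {

        Point2D keyMidpoint = new Point2D.Double(0, 0);
        Point2D valueMidpoint = new Point2D.Double(10, 0);
        Line2D connector = new Line2D.Double(keyMidpoint, valueMidpoint);

        double arrowLength = Math.min(connector.getP1().distance(connector.getP2()), 5);
        double angle = Math.atan2(valueMidpoint.getY() - keyMidpoint.getY(), valueMidpoint.getX() - keyMidpoint.getX());
        double headAngle = Math.PI / 6; // 30 degrees

        Line2D head1 = new Line2D.Double(valueMidpoint,
                new Point2D.Double(valueMidpoint.getX() - arrowLength * Math.cos(angle - headAngle),
                        valueMidpoint.getY() - arrowLength * Math.sin(angle - headAngle)));
        Line2D head2 = new Line2D.Double(valueMidpoint,
                new Point2D.Double(valueMidpoint.getX() - arrowLength * Math.cos(angle + headAngle),
                        valueMidpoint.getY() - arrowLength * Math.sin(angle + headAngle)));

        System.out.printf("head1 ends at (%.2f, %.2f)%n", head1.getX2(), head1.getY2()); // ~ (5.67, 2.50)
        System.out.printf("head2 ends at (%.2f, %.2f)%n", head2.getX2(), head2.getY2()); // ~ (5.67, -2.50)
    }
}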
|
||||
@ -1,34 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class BBoxMergingUtility {
|
||||
|
||||
public Map<Page, Rectangle2D> mergeBBoxes(List<Map<Page, Rectangle2D>> bboxesToMerge) {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
Set<Page> pages = bboxesToMerge.stream()
|
||||
.flatMap(map -> map.keySet()
|
||||
.stream())
|
||||
.collect(Collectors.toSet());
|
||||
for (Page page : pages) {
|
||||
Rectangle2D bBoxOnPage = bboxesToMerge.stream()
|
||||
.filter(childBboxPerPage -> childBboxPerPage.containsKey(page))
|
||||
.map(childBboxPerPage -> childBboxPerPage.get(page))
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
bBoxPerPage.put(page, bBoxOnPage);
|
||||
}
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
}
|
||||
@ -2,6 +2,8 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
|
||||
@ -4,12 +4,14 @@ import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
|
||||
public class GeometricComparators {
|
||||
|
||||
private static final int COMPARATOR_ROUNDING = 2;
|
||||
static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
||||
|
||||
public static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (point1, point2) -> {
|
||||
|
||||
@ -58,6 +60,17 @@ public class GeometricComparators {
|
||||
return cell1Size.compareTo(cell2Size);
|
||||
};
|
||||
|
||||
public static final Comparator<BoundingBox> CELL_SORTER = (o1, o2) -> {
|
||||
|
||||
if (o1.equals(o2)) {
|
||||
return 0;
|
||||
}
|
||||
if (o1.verticalOverlapPdf(o2) > VERTICAL_COMPARISON_THRESHOLD * ((o1.getHeight() + o2.getHeight()) / 2)) {
|
||||
return Double.compare(o1.getMinX(), o2.getMinX());
|
||||
} else {
|
||||
return Double.compare(o1.getMaxY(), o2.getMaxY());
|
||||
}
|
||||
};
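// Standalone sketch of the reading-order idea behind CELL_SORTER above, using plain Rectangle2D in a y-down
// coordinate system (the production comparator works on PDF-space BoundingBoxes): boxes that share enough vertical
// overlap are ordered left-to-right, otherwise top-to-bottom; 0.4 mirrors VERTICAL_COMPARISON_THRESHOLD.
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;

class ReadingOrderSketch {

    static int compare(Rectangle2D a, Rectangle2D b) {
        double overlap = Math.min(a.getMaxY(), b.getMaxY()) - Math.max(a.getMinY(), b.getMinY());
        if (overlap > 0.4 * ((a.getHeight() + b.getHeight()) / 2)) {
            return Double.compare(a.getMinX(), b.getMinX()); // same line: left to right
        }
        return Double.compare(a.getMaxY(), b.getMaxY());      // different lines: top to bottom
    }

    public static void main(String[] args) {
        List<Rectangle2D> boxes = new ArrayList<>(List.of(
                new Rectangle2D.Double(50, 0, 10, 10),   // first line, right
                new Rectangle2D.Double(0, 0, 10, 10),    // first line, left
                new Rectangle2D.Double(0, 20, 10, 10))); // second line
        boxes.sort(ReadingOrderSketch::compare);
        System.out.println(boxes); // left box, right box, then the second-line box
    }
}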
|
||||
public static final Comparator<Rectangle2D> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
|
||||
|
||||
Double rect1Size = rect1.getHeight() * rect1.getWidth();
|
||||
|
||||
@ -1,59 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
|
||||
|
||||
public record PageInformation(Rectangle2D mediabox, int number, int rotationDegrees) {
|
||||
|
||||
public static PageInformation fromPDPage(int pageNum, PDPage page) {
|
||||
|
||||
PDRectangle mediaBox = page.getMediaBox();
|
||||
return new PageInformation(new Rectangle2D.Double(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight()),
|
||||
pageNum,
|
||||
page.getRotation());
|
||||
}
|
||||
|
||||
|
||||
public static PageInformation fromPage(Page page) {
|
||||
|
||||
return new PageInformation(new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()), page.getNumber(), page.getRotation());
|
||||
}
|
||||
|
||||
|
||||
public double height() {
|
||||
|
||||
return mediabox.getHeight();
|
||||
}
|
||||
|
||||
|
||||
public double heightRot() {
|
||||
|
||||
if (rotationDegrees == 90 || rotationDegrees == 270) {
|
||||
return width();
|
||||
}
|
||||
return height();
|
||||
}
|
||||
|
||||
|
||||
public double width() {
|
||||
|
||||
return mediabox.getWidth();
|
||||
}
|
||||
|
||||
|
||||
public double minX() {
|
||||
|
||||
return mediabox.getX();
|
||||
}
|
||||
|
||||
|
||||
public double minY() {
|
||||
|
||||
return mediabox.getY();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,42 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
import com.google.protobuf.Message;
|
||||
import com.google.protobuf.MessageOrBuilder;
|
||||
import com.google.protobuf.Struct;
|
||||
import com.google.protobuf.util.JsonFormat;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ProtobufUtil {
|
||||
|
||||
public static String toJson(MessageOrBuilder messageOrBuilder) throws IOException {
|
||||
return JsonFormat.printer().print(messageOrBuilder);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public static Message fromJson(String json) throws IOException {
|
||||
Message.Builder structBuilder = Struct.newBuilder();
|
||||
JsonFormat.parser().ignoringUnknownFields().merge(json, structBuilder);
|
||||
return structBuilder.build();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public <T extends Message> File serializeToTempFile(T any) {
|
||||
var tempFile = File.createTempFile("storage-protobuf", ".data");
|
||||
|
||||
try (var fos = new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(tempFile)))) {
|
||||
any.writeTo(fos);
|
||||
return tempFile;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -2,10 +2,12 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
@ -125,7 +127,7 @@ public class RectangleTransformations {
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D rectangle2DBBox(List<Rectangle2D> rectangle2DList) {
|
||||
public static Rectangle2D rectangle2DBBox(Collection<Rectangle2D> rectangle2DList) {
|
||||
|
||||
return rectangle2DList.stream()
|
||||
.collect(new Rectangle2DBBoxCollector());
|
||||
@ -185,6 +187,12 @@ public class RectangleTransformations {
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D transform(Rectangle2D rect, AffineTransform transform) {
|
||||
|
||||
return transform.createTransformedShape(rect).getBounds2D();
|
||||
}
|
||||
|
||||
|
||||
private static class Rectangle2DBBoxCollector implements Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> {
|
||||
|
||||
@Override
|
||||
|
||||
@ -13,7 +13,7 @@ import java.util.stream.Collectors;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
@ -46,6 +46,12 @@ public class TextPositionOperations {
|
||||
return sortUsingLineDetection(sequences);
|
||||
}
|
||||
|
||||
public List<Word> mergeAndSort(TextPageBlock textBlocks) {
|
||||
|
||||
var sequences = new HashSet<>(textBlocks.getWords());
|
||||
return sortUsingLineDetection(sequences);
|
||||
}
|
||||
|
||||
|
||||
public List<Word> sort(List<Word> sequences) {
|
||||
|
||||
|
||||
@ -1,44 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
// simple implementation of a disjoint-set data structure
|
||||
// https://en.wikipedia.org/wiki/Disjoint-set_data_structure
|
||||
public class UnionFind<T> {
|
||||
|
||||
Map<T, T> parents = new HashMap<>();
|
||||
Map<T, Integer> numberOfObjects = new HashMap<>();
|
||||
|
||||
|
||||
public T find(T node) {
|
||||
|
||||
if (!parents.containsKey(node)) {
|
||||
parents.put(node, node);
|
||||
numberOfObjects.put(node, 1);
|
||||
}
|
||||
if (!node.equals(parents.get(node))) {
|
||||
parents.put(node, find(parents.get(node)));
|
||||
}
|
||||
return parents.get(node);
|
||||
}
|
||||
|
||||
|
||||
public void union(T node1, T node2) {
|
||||
|
||||
T root1 = find(node1);
|
||||
T root2 = find(node2);
|
||||
|
||||
if (!root1.equals(root2)) {
|
||||
if (numberOfObjects.getOrDefault(root1, 1) < numberOfObjects.getOrDefault(root2, 1)) {
|
||||
parents.put(root1, root2);
|
||||
numberOfObjects.put(root2, numberOfObjects.get(root2) + numberOfObjects.get(root1));
|
||||
} else {
|
||||
parents.put(root2, root1);
|
||||
numberOfObjects.put(root1, numberOfObjects.get(root1) + numberOfObjects.get(root2));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -17,7 +17,6 @@ import java.util.concurrent.atomic.AtomicInteger;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.data.LayoutEngineProto;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
|
||||
@ -36,7 +35,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.LayoutDebugLayerConfig;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
@ -59,7 +58,7 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
|
||||
boolean active;
|
||||
boolean active = true;
|
||||
|
||||
Map<Integer, AtomicInteger> outlineObjectsWithoutPointsPerPage = new HashMap<>();
|
||||
|
||||
@ -141,7 +140,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
}
|
||||
|
||||
|
||||
public void addCellVisualizations(List<? extends BoundingBox> cells, int pageNumber) {
|
||||
public void addCellVisualizations(Collection<? extends BoundingBox> cells, int pageNumber, Color color) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
@ -149,7 +148,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(cells.stream()
|
||||
.map(cell -> new ColoredRectangle(cell.getBBoxPdf(), CELLS_COLOR, 1))
|
||||
.map(cell -> new ColoredRectangle(cell.getBBoxPdf(), color == null ? CELLS_COLOR : color, 1))
|
||||
.toList());
|
||||
}
|
||||
|
||||
@ -211,7 +210,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
}
|
||||
|
||||
|
||||
public void addTextBlockVisualizations(List<AbstractPageBlock> textPageBlocks, int page) {
|
||||
public void addTextBlockVisualizations(List<? extends AbstractPageBlock> textPageBlocks, int page) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
|
||||
@ -26,9 +26,8 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNo
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.LayoutGridLayerConfig;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
@ -93,19 +92,10 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
||||
public void addTreeId(SemanticNode semanticNode) {
|
||||
|
||||
Page page = semanticNode.getFirstPage();
|
||||
if (semanticNode.getBBox()
|
||||
.get(page) == null) {
|
||||
if (semanticNode.getBBox().get(page) == null) {
|
||||
return;
|
||||
}
|
||||
addPlacedText(page,
|
||||
semanticNode.getBBox()
|
||||
.get(page),
|
||||
semanticNode.getBBox()
|
||||
.get(page),
|
||||
buildTreeIdString(semanticNode),
|
||||
1,
|
||||
treeIds,
|
||||
TREEID_COLOR);
|
||||
addPlacedText(page, semanticNode.getBBox().get(page), semanticNode.getBBox().get(page), buildTreeIdString(semanticNode), 1, treeIds, TREEID_COLOR);
|
||||
}
|
||||
|
||||
|
||||
@ -134,8 +124,7 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
||||
.toList();
|
||||
Integer maxChildDepth = subSections.stream()
|
||||
.map(node -> node.getTreeId().size())
|
||||
.max(Integer::compareTo)
|
||||
.orElse(section.getTreeId().size());
|
||||
.max(Integer::compareTo).orElse(section.getTreeId().size());
|
||||
int ownDepth = section.getTreeId().size();
|
||||
|
||||
Page firstPage = section.getFirstPage();
|
||||
@ -321,8 +310,7 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
||||
Visualizations visualizations = semanticNode.getType().equals(NodeType.TABLE_OF_CONTENTS) ? toc : sections;
|
||||
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), visualizations).getColoredLines();
|
||||
int lineWidthModifier = maxChildDepth - ownDepth;
|
||||
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox()
|
||||
.get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier));
|
||||
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox().get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier));
|
||||
|
||||
SemanticNode highestParent = semanticNode.getHighestParent();
|
||||
Rectangle2D highestParentRect = rectangleMap.get(new RectangleIdentifier(highestParent.getTreeId(), page.getNumber()));
|
||||
@ -371,8 +359,7 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
||||
List<Double> ys = yStream.collect(Collectors.toList());
|
||||
ys.remove(0);
|
||||
|
||||
Rectangle2D tableBBox = table.getBBox()
|
||||
.get(page);
|
||||
Rectangle2D tableBBox = table.getBBox().get(page);
|
||||
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), tables).getColoredLines();
|
||||
|
||||
xs.forEach(x -> {
|
||||
|
||||
@ -0,0 +1,60 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
|
||||
class TableAreaFillerTest {
|
||||
|
||||
@Test
|
||||
void findMissingCells() {
|
||||
|
||||
Rectangle2D area = new Rectangle2D.Double(0, 0, 2, 2);
|
||||
List<Rectangle2D> rectangles = List.of(new Rectangle2D.Double(0, 0, 1, 1), new Rectangle2D.Double(1, 1, 1, 1), new Rectangle2D.Double(1, 0, 1, 1));
|
||||
Set<Rectangle2D> missing = TableAreaFiller.findMissingRects(rectangles, area);
|
||||
|
||||
assertEquals(1, missing.size());
|
||||
assertEquals(new Rectangle2D.Double(0, 1, 1, 1), missing.iterator().next());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void findMissingCells2() {
|
||||
|
||||
Rectangle2D area = new Rectangle2D.Double(0, 0, 3, 3);
|
||||
List<Rectangle2D> rectangles = List.of(new Rectangle2D.Double(0, 0, 1, 1),
|
||||
new Rectangle2D.Double(1, 0, 1, 1),
|
||||
new Rectangle2D.Double(2, 0, 1, 1),
|
||||
new Rectangle2D.Double(0, 1, 1, 1),
|
||||
new Rectangle2D.Double(1, 1, 1, 1),
|
||||
new Rectangle2D.Double(2, 1, 1, 1));
|
||||
|
||||
|
||||
var missing = TableAreaFiller.findMissingRects(rectangles, area);
|
||||
assertEquals(1, missing.size());
|
||||
assertEquals(new Rectangle2D.Double(0, 2, 3, 1), missing.iterator().next());
|
||||
}
|
||||
|
||||
@Test
|
||||
void findMissingCells3() {
|
||||
|
||||
Rectangle2D area = new Rectangle2D.Double(0, 0, 2, 2);
|
||||
List<Rectangle2D> rectangles = List.of(new Rectangle2D.Double(0, 0, 1, 1));
|
||||
Set<Rectangle2D> missing = TableAreaFiller.findMissingRects(rectangles, area);
|
||||
|
||||
assertEquals(2, missing.size());
|
||||
Iterator<Rectangle2D> iterator = missing.iterator();
|
||||
assertEquals(new Rectangle2D.Double(0, 1, 2, 1), iterator.next());
|
||||
assertEquals(new Rectangle2D.Double(1, 0, 1, 1), iterator.next());
|
||||
}
|
||||
|
||||
}
|
||||
@ -75,6 +75,7 @@ public abstract class AbstractTest {
|
||||
protected final static String TENANT_ID = "tenant";
|
||||
protected final static String VIEWER_DOCUMENT_ID = "viewer";
|
||||
protected final static String SIMPLIFIED_ID = "simplified";
|
||||
protected final static String IDP_ID = "idp";
|
||||
|
||||
|
||||
protected LayoutParsingRequest buildStandardLayoutParsingRequest() {
|
||||
@ -117,7 +118,14 @@ public abstract class AbstractTest {
|
||||
|
||||
public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
|
||||
|
||||
return buildDefaultLayoutParsingRequest(fileName, layoutParsingType, debug, false);
|
||||
}
|
||||
|
||||
|
||||
public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug, boolean withIdpResult) {
|
||||
|
||||
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
|
||||
Optional<String> idpResultStorageId = withIdpResult ? Optional.of(fileName + IDP_ID) : Optional.empty();
|
||||
return LayoutParsingRequest.builder()
|
||||
.identifier(identifier)
|
||||
.layoutParsingType(layoutParsingType)
|
||||
@ -132,6 +140,7 @@ public abstract class AbstractTest {
|
||||
.simplifiedTextStorageId(fileName + SIMPLIFIED_ID)
|
||||
.viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID)
|
||||
.documentMarkdownFileStorageId(Optional.of(fileName + MARKDOWN_FILE_ID))
|
||||
.idpResultStorageId(idpResultStorageId)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
@ -34,6 +34,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.factory.Doc
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -51,11 +52,12 @@ public class BdrJsonBuildTest extends AbstractTest {
|
||||
|
||||
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.CLARIFYND,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.CLARIFYND,
|
||||
file,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file",file.toString()))).document();
|
||||
file,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
IdpResult.empty(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file",file.toString()))).document();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -16,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVi
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -39,6 +40,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
||||
fileResource,
|
||||
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
|
||||
new TableServiceResponse(),
|
||||
IdpResult.empty(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", filename, "debug", "true"));
|
||||
}
|
||||
@ -63,6 +65,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
||||
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
|
||||
.get()),
|
||||
new TableServiceResponse(),
|
||||
IdpResult.empty(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
layoutParsingRequest.identifier()));
|
||||
} else {
|
||||
|
||||
@ -30,6 +30,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.tenantcommons.TenantsClient;

import lombok.AllArgsConstructor;
@ -106,6 +107,7 @@ public class HeadlinesGoldStandardIntegrationTest {
pdfFileResource.getFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
IdpResult.empty(),
new VisualLayoutParsingResponse(),
Map.of("file", filePath))).document();
var foundHeadlines = documentGraph.streamAllSubNodes()

@ -1,6 +1,7 @@
package com.knecon.fforesight.service.layoutparser.server;

import java.io.File;
import java.io.FileInputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
@ -24,6 +25,7 @@ import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;

@Slf4j
@Disabled
public class LayoutparserEnd2EndTest extends AbstractTest {

public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE_OLD;
@ -33,15 +35,24 @@ public class LayoutparserEnd2EndTest extends AbstractTest {

@Test
@Disabled
public void testLayoutParserEndToEnd() {

String filePath = "/home/kschuettler/Dokumente/LayoutparsingEvaluation/VV-340050.pdf";
String filePath = "/home/kschuettler/Dokumente/TestFiles/OCR/TestSet/VV-331340-first100.pdf";

runForFile(filePath);
}

@Test
public void testLayoutParserEndToEndWithIdpResult() {

String filePath = "/home/kschuettler/Dokumente/Ticket Related/RED-8670/VV-331340-first100.pdf/viewerDocument.pdf";
String idpResultPath = "/home/kschuettler/Dokumente/Ticket Related/RED-8670/VV-331340-first100.pdf/idpResult.json";

runForFile(filePath, idpResultPath);
}

@Test
@Disabled
@SneakyThrows
@ -62,9 +73,15 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
}

@SneakyThrows
private void runForFile(String filePath) {

runForFile(filePath, null);
}

@SneakyThrows
private void runForFile(String filePath, String idpResultPath) {

String fileName = Path.of(filePath).getFileName().toString();
File file;
if (filePath.startsWith("files")) { // from resources
@ -73,7 +90,13 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
file = new File(filePath);
}

LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LAYOUT_PARSING_TYPE, true);
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LAYOUT_PARSING_TYPE, true, true);

if (layoutParsingRequest.idpResultStorageId().isPresent() && idpResultPath != null) {
try (var in = new FileInputStream(idpResultPath)) {
storageService.storeObject(TENANT_ID, layoutParsingRequest.idpResultStorageId().get(), in);
}
}

prepareStorage(layoutParsingRequest, file);

@ -32,6 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;

import lombok.SneakyThrows;
@ -192,6 +193,7 @@ public class OutlineDetectionTest extends AbstractTest {
fileResource,
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
new TableServiceResponse(),
IdpResult.empty(),
new VisualLayoutParsingResponse(),
Map.of("file", filename, "debug", "true"));
}
@ -209,6 +211,7 @@ public class OutlineDetectionTest extends AbstractTest {
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
.get()),
new TableServiceResponse(),
IdpResult.empty(),
new VisualLayoutParsingResponse(),
layoutParsingRequest.identifier()));
} else {

@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;

import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
@ -61,6 +62,7 @@ public class SimplifiedTextServiceTest extends AbstractTest {
file,
new ImageServiceResponse(),
new TableServiceResponse(),
IdpResult.empty(),
new VisualLayoutParsingResponse(),
Map.of("file", file.toString()))).document();
}

@ -21,6 +21,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.server.BuildDocumentTest;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;

import lombok.SneakyThrows;

@ -58,11 +59,12 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {

Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
new VisualLayoutParsingResponse(),
Map.of("file",filename.toFile().toString()))).document();
filename.toFile(),
new ImageServiceResponse(),
new TableServiceResponse(),
IdpResult.empty(),
new VisualLayoutParsingResponse(),
Map.of("file",filename.toFile().toString()))).document();

DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
ObjectMapper mapper = ObjectMapperFactory.create();

@ -29,7 +29,7 @@ public class DocumentGraphMappingTest extends BuildDocumentTest {
@SneakyThrows
public void testGraphMapping() {

String filename = "files/syngenta/CustomerFiles/SYNGENTA_EFSA_sanitisation_GFL_v1_moreSections.pdf";
String filename = "files/syngenta/CustomerFiles/Fludioxonil_duplicates.pdf";

Document document = buildGraph(filename);
DocumentData documentData = DocumentDataMapper.toDocumentData(document);

@ -17,8 +17,9 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer;
import com.knecon.fforesight.service.layoutparser.server.BuildDocumentTest;
import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;

import lombok.SneakyThrows;
@ -74,6 +75,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
documentFile,
new ImageServiceResponse(),
tableResponse,
IdpResult.empty(),
new VisualLayoutParsingResponse(),
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null);

@ -39,6 +39,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
import com.knecon.fforesight.service.layoutparser.server.AbstractTest;
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;

import lombok.SneakyThrows;

@ -58,6 +59,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
originDocument,
new ImageServiceResponse(),
tableServiceResponse,
IdpResult.empty(),
new VisualLayoutParsingResponse(),
Map.of("file", "document"));

@ -103,24 +105,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
String textToSearch = "Annex to Regulation 283/2013 Annex to Regulation 284/2013";
ClassPathResource pdfFileResource = new ClassPathResource(fileName);

List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
List<PageContents> textPositionPerPage = PageContentExtractor.getDocumentContents(pdfFileResource.getFile(), 4);
var textPositions = textPositionPerPage.stream()
.flatMap(t -> t.getSortedWords()
.flatMap(t -> t.getWords()
.stream()
.map(Word::toString))
.collect(Collectors.joining(" "));
assertThat(textPositions.contains(textToSearch)).isFalse();
assertThat(textPositions.contains(textToSearch)).isTrue();

ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());

assertThat(classificationDocument.getHeaders()
.get(0).getTextBlocks().size()).isEqualTo(3);
assertThat(classificationDocument.getHeaders()
.get(0).getTextBlocks()
.get(0).getWords().size()).isEqualTo(8);
assertThat(classificationDocument.getHeaders()
.get(0).getTextBlocks()
.get(0).toString()).contains(textToSearch);
assertThat(classificationDocument.getHeaders().get(0).getTextBlocks().size()).isEqualTo(3);
assertThat(classificationDocument.getHeaders().get(0).getTextBlocks().get(0).getWords().size()).isEqualTo(8);
assertThat(classificationDocument.getHeaders().get(0).getTextBlocks().get(0).toString()).contains(textToSearch);

Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, classificationDocument).document();

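Editor's note: as this hunk shows, text extraction now goes through PageContentExtractor.getDocumentContents(file, n) and PageContents.getWords() instead of the old getSortedPageContents/getSortedWords pair. A minimal usage sketch follows; the class and method names are taken from the diff, while the interpretation of the second argument as a parallelism hint and the resource path are assumptions.

// Sketch: join all extracted words of a document into one string.
File pdf = new ClassPathResource("files/some-document.pdf").getFile();   // hypothetical resource
List<PageContents> pages = PageContentExtractor.getDocumentContents(pdf, 4);
String allText = pages.stream()
        .flatMap(page -> page.getWords().stream())
        .map(Word::toString)
        .collect(Collectors.joining(" "));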
@ -216,8 +213,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
.toList().get(0);
assertThat(table.getColCount()).isEqualTo(6);
assertThat(table.getRowCount()).isEqualTo(13);
assertThat(table.getRows()
@ -246,8 +242,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
.toList().get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems()
@ -256,12 +251,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(1);
.toList().get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(2);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(0)
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0)
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());
@ -293,8 +286,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
.toList().get(0);
assertThat(firstTable.getColCount()).isEqualTo(9);
assertThat(firstTable.getRowCount()).isEqualTo(5);
TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems()
@ -303,12 +295,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(1);
.toList().get(1);
assertThat(secondTable.getColCount()).isEqualTo(9);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(firstTable.getRowCount() - 1)
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1)
.stream()
.map(Cell::getHeaderCells)
.collect(Collectors.toList());
@ -340,8 +330,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(0);
.toList().get(0);
assertThat(firstTable.getColCount()).isEqualTo(8);
assertThat(firstTable.getRowCount()).isEqualTo(1);
TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems()
@ -350,12 +339,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(1);
.toList().get(1);
assertThat(secondTable.getColCount()).isEqualTo(8);
assertThat(secondTable.getRowCount()).isEqualTo(6);
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
.get(0)
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0)
.stream()
.map(Collections::singletonList)
.collect(Collectors.toList());

@ -376,10 +363,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 4);

validateTable(document, 0, 1, 1, 0, 0);
validateTable(document, 1, 2, 2, 0, 0);
validateTable(document, 2, 4, 19, 12, 0);
validateTable(document, 3, 2, 12, 0, 0);
validateTable(document, 0, 1, 1, 0);
validateTable(document, 1, 2, 2, 0);
validateTable(document, 2, 2, 12, 0);
validateTable(document, 3, 4, 19, 12);

}

@ -393,10 +380,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 4);

validateTable(document, 0, 5, 4, 0, 0);
validateTable(document, 1, 5, 15, 14, 0);
validateTable(document, 2, 5, 14, 11, 0);
validateTable(document, 3, 5, 3, 0, 0);
validateTable(document, 0, 5, 4, 0);
validateTable(document, 1, 5, 15, 14);
validateTable(document, 2, 5, 14, 11);
validateTable(document, 3, 5, 3, 0);

}

@ -410,7 +397,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 1);

validateTable(document, 0, 8, 8, 0, 0);
validateTable(document, 0, 8, 8, 0);

List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
"Author, date",
@ -455,10 +442,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 4);

validateTable(document, 0, 3, 2, 0, 0);
validateTable(document, 1, 3, 2, 0, 0);
validateTable(document, 2, 3, 3, 0, 0);
validateTable(document, 3, 3, 3, 0, 0);
validateTable(document, 0, 3, 2, 0);
validateTable(document, 1, 3, 2, 0);
validateTable(document, 2, 3, 3, 0);
validateTable(document, 3, 3, 3, 0);

}

@ -473,7 +460,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 1);

validateTable(document, 0, 7, 4, 0, 0);
validateTable(document, 0, 7, 4, 0);
}

@ -486,7 +473,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 1);

validateTable(document, 0, 7, 4, 0, 0);
validateTable(document, 0, 7, 4, 0);
}

@ -499,12 +486,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 6);

validateTable(document, 0, 2, 1, 0, 0);
validateTable(document, 1, 2, 1, 0, 0);
validateTable(document, 2, 2, 5, 0, 0);
validateTable(document, 3, 2, 5, 0, 0);
validateTable(document, 4, 2, 4, 0, 0);
validateTable(document, 5, 2, 1, 0, 0);
validateTable(document, 0, 2, 1, 0);
validateTable(document, 1, 2, 1, 0);
validateTable(document, 2, 2, 5, 0);
validateTable(document, 3, 2, 5, 0);
validateTable(document, 4, 2, 4, 0);
validateTable(document, 5, 2, 1, 0);

}

@ -518,9 +505,9 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 3);

validateTable(document, 0, 7, 9, 0, 0);
validateTable(document, 1, 2, 1, 0, 0);
validateTable(document, 2, 2, 10, 0, 0);
validateTable(document, 0, 7, 9, 0);
validateTable(document, 1, 2, 1, 0);
validateTable(document, 2, 2, 10, 0);

}

@ -533,7 +520,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());

validateTableSize(document, 1);
validateTable(document, 0, 9, 9, 0, 0);
validateTable(document, 0, 9, 9, 0);

}

@ -547,7 +534,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 1);

validateTable(document, 0, 9, 5, 6, 0);
validateTable(document, 0, 9, 5, 6);

}

@ -560,7 +547,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());

validateTableSize(document, 1);
validateTable(document, 0, 9, 6, 7, 0);
validateTable(document, 0, 9, 6, 7);

}

@ -574,7 +561,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());

validateTableSize(document, 1);
validateTable(document, 0, 10, 6, 0, 0);
validateTable(document, 0, 10, 6, 0);

}

@ -588,8 +575,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());

validateTableSize(document, 2);
validateTable(document, 0, 2, 2, 0, 0);
validateTable(document, 1, 1, 1, 0, 0);
validateTable(document, 0, 2, 2, 0);
validateTable(document, 1, 1, 1, 0);

}

@ -604,8 +591,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 2);

validateTable(document, 0, 7, 8, 1, 0);
validateTable(document, 1, 7, 8, 1, 0);
validateTable(document, 0, 7, 8, 1);
validateTable(document, 1, 7, 8, 1);

}

@ -620,8 +607,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 2);

validateTable(document, 0, 4, 17, 0, 0);
validateTable(document, 1, 7, 12, 0, 0);
validateTable(document, 0, 4, 17, 0);
validateTable(document, 1, 7, 12, 0);

}

@ -636,8 +623,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 2);

validateTable(document, 0, 5, 14, 4, 0);
validateTable(document, 1, 7, 12, 0, 0);
validateTable(document, 0, 5, 14, 4);
validateTable(document, 1, 7, 12, 0);

}

@ -651,8 +638,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 2);

validateTable(document, 0, 5, 17, 3, 0);
validateTable(document, 1, 5, 16, 2, 0);
validateTable(document, 0, 5, 17, 3);
validateTable(document, 1, 5, 16, 2);

}

@ -666,10 +653,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 4);

validateTable(document, 0, 4, 4, 0, 0);
validateTable(document, 1, 1, 1, 0, 0);
validateTable(document, 2, 2, 3, 0, 0);
validateTable(document, 3, 1, 1, 0, 0);
validateTable(document, 0, 4, 4, 0);
validateTable(document, 1, 1, 1, 0);
validateTable(document, 2, 2, 3, 0);
validateTable(document, 3, 1, 1, 0);

}

@ -684,7 +671,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 1);

validateTable(document, 0, 11, 8, 0, 0);
validateTable(document, 0, 11, 8, 0);

}

@ -699,8 +686,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 2);

validateTable(document, 0, 6, 8, 0, 0);
validateTable(document, 1, 6, 8, 0, 0);
validateTable(document, 0, 6, 8, 0);
validateTable(document, 1, 6, 8, 0);

}

@ -714,7 +701,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 1);

validateTable(document, 0, 9, 5, 2, 0);
validateTable(document, 0, 9, 5, 2);

}

@ -728,7 +715,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 1);

validateTable(document, 0, 3, 5, 0, 0);
validateTable(document, 0, 3, 5, 0);

}

@ -742,7 +729,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 1);

validateTable(document, 0, 6, 8, 0, 0);
validateTable(document, 0, 6, 8, 0);
}

@ -755,10 +742,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 4);

validateTable(document, 0, 3, 3, 0, 0);
validateTable(document, 1, 3, 6, 2, 0);
validateTable(document, 2, 3, 3, 1, 0);
validateTable(document, 3, 3, 3, 0, 0);
validateTable(document, 0, 3, 6, 0);
validateTable(document, 1, 3, 3, 0);
validateTable(document, 2, 3, 3, 0);
validateTable(document, 3, 3, 3, 0);

}

@ -772,12 +759,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 6);

validateTable(document, 0, 5, 5, 0, 0);
validateTable(document, 1, 5, 6, 0, 0);
validateTable(document, 2, 5, 5, 0, 0);
validateTable(document, 3, 5, 5, 0, 0);
validateTable(document, 4, 5, 5, 0, 0);
validateTable(document, 5, 5, 5, 0, 0);
validateTable(document, 0, 5, 6, 0);
validateTable(document, 1, 5, 5, 0);
validateTable(document, 2, 5, 5, 0);
validateTable(document, 3, 5, 5, 0);
validateTable(document, 4, 5, 5, 0);
validateTable(document, 5, 5, 5, 0);

}

@ -791,7 +778,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 1);

validateTable(document, 0, 6, 5, 0, 0);
validateTable(document, 0, 6, 5, 0);

}

@ -805,7 +792,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 1);

validateTable(document, 0, 5, 8, 1, 0);
validateTable(document, 0, 5, 8, 0);

}

@ -816,13 +803,14 @@ public class PdfSegmentationServiceTest extends AbstractTest {
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/T5_Page16_VV-640252.pdf");

ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());

validateTableSize(document, 5);
validateTable(document, 0, 1, 1, 0, 0);
validateTable(document, 1, 1, 1, 0, 0);
validateTable(document, 2, 1, 1, 0, 0);
validateTable(document, 3, 1, 1, 0, 0);
validateTable(document, 4, 1, 1, 0, 0);
validateTableSize(document, 6);
// does not make sense to assert anything here other than that it runs. This is not a Table and completely breaks the current table detection logic
// viewerDocumentService.addLayerGroups(pdfFileResource.getFile(), new File("/tmp/cellDebug.pdf"), List.of(document.getLayoutDebugLayer()));
// validateTable(document, 0, 1, 1, 0);
// validateTable(document, 1, 1, 1, 0);
// validateTable(document, 2, 1, 1, 3);
// validateTable(document, 3, 1, 1, 0);
// validateTable(document, 4, 1, 1, 0);

}

@ -836,7 +824,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {

validateTableSize(document, 1);

validateTable(document, 0, 6, 6, 5, 0);
validateTable(document, 0, 6, 6, 5);

}

@ -869,7 +857,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
}

private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect) {

TablePageBlock table = document.getSectionTree().getAllTableOfContentItems()
.stream()
@ -877,8 +865,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(tableIndex);
.toList().get(tableIndex);

List<List<Cell>> rows = table.getRows();
int emptyCellsFoundFound = rows.stream()
@ -891,7 +878,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
for (List<Cell> row : table.getRows()) {
row.forEach(r -> System.out.println(r.toString()));
}
assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect);
assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect);

assertThat(table.getColCount()).isEqualTo(colCount);
assertThat(table.getRowCount()).isEqualTo(rowCount);
@ -907,8 +894,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
.stream()
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
.toList()
.get(tableIndex);
.toList().get(tableIndex);
List<List<Cell>> rows = table.getRows();

List<Cell> rowsFlattened = rows.stream()

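Editor's note: the signature change above drops the unused emptyCellsCountIncorrect parameter, so each assertion now checks a single expected empty-cell count. A compact sketch of the reduced helper is shown below; the table lookup and the final assertions follow the hunks, while the getPageBlocks() accessor and the blank-toString definition of an "empty" cell are assumptions, since the diff truncates those lines.

// Sketch of the simplified helper; only one expected empty-cell count remains.
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect) {
    TablePageBlock table = document.getSectionTree().getAllTableOfContentItems()
            .stream()
            .flatMap(item -> item.getPageBlocks().stream())               // assumed accessor
            .filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock)
            .map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
            .toList().get(tableIndex);

    long emptyCellsFound = table.getRows().stream()
            .flatMap(List::stream)
            .filter(cell -> cell.toString().isBlank())                    // assumed definition of "empty"
            .count();

    assertThat(emptyCellsFound).isEqualTo(emptyCellsCountCorrect);
    assertThat(table.getColCount()).isEqualTo(colCount);
    assertThat(table.getRowCount()).isEqualTo(rowCount);
}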
@ -6,14 +6,10 @@ import java.util.List;

import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;

import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.services.DividingColumnDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService;
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
import com.knecon.fforesight.service.layoutparser.processor.experimental.DividingColumnDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;

@ -21,32 +17,6 @@ import lombok.SneakyThrows;

class GapAcrossLinesDetectionServiceTest {

@Test
@Disabled
@SneakyThrows
public void testGapBasedColumnDetection() {

String filename = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis();
List<PageInformation> pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList();
List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
System.out.println("start column detection");
start = System.currentTimeMillis();
for (PageInformation pageInformation : pageInformations) {
GapInformation gapInformation = GapDetectionService.findGapsInLines(pageInformation.getPageContents().getSortedWords(), pageInformation.getMainBodyTextFrame());
columnsPerPage.add(GapsAcrossLinesService.detectXGapsAcrossLines(gapInformation, pageInformation.getMainBodyTextFrame()));
}
System.out.printf("Finished column detection in %d ms%n", System.currentTimeMillis() - start);
System.out.println("start draw rectangles");
start = System.currentTimeMillis();
PdfDraw.drawRectanglesPerPage(filename, columnsPerPage, tmpFileName);
System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
}

@Test
@Disabled
@SneakyThrows
@ -56,7 +26,7 @@ class GapAcrossLinesDetectionServiceTest {
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
System.out.println("start TextPosition extraction");
long start = System.currentTimeMillis();
List<PageContents> sortedTextPositionSequencesPerPage = PageContentExtractor.getSortedPageContents(filename);
List<PageContents> sortedTextPositionSequencesPerPage = PageContentExtractor.getDocumentContents(new ClassPathResource(filename).getFile(), 4);
List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
System.out.println("start column detection");

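Editor's note: the deleted testGapBasedColumnDetection above still relied on the removed getSortedPageContents/getSortedWords API. If the same visual check were rebuilt on the new extractor, it would look roughly like the sketch below; the service calls are taken from the deleted test, the getWords() accessor and the parallelism argument come from other hunks in this diff, and whether findGapsInLines accepts that word list unchanged is an assumption.

// Sketch: gap-based column detection per page, ported to the new extraction API.
File pdf = new ClassPathResource("files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf").getFile();
List<PageInformation> pageInformations = PageContentExtractor.getDocumentContents(pdf, 4)
        .stream()
        .map(PageInformationService::build)
        .toList();

List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
for (PageInformation pageInformation : pageInformations) {
    // find horizontal gaps inside each line, then merge gaps that align across lines into column separators
    GapInformation gapInformation = GapDetectionService.findGapsInLines(pageInformation.getPageContents().getWords(), pageInformation.getMainBodyTextFrame());
    columnsPerPage.add(GapsAcrossLinesService.detectXGapsAcrossLines(gapInformation, pageInformation.getMainBodyTextFrame()));
}
PdfDraw.drawRectanglesPerPage(pdf.getPath(), columnsPerPage, "/tmp/columns-debug.pdf");   // assumed output path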
@ -1,66 +0,0 @@
package com.knecon.fforesight.service.layoutparser.server.services;

import java.awt.geom.Rectangle2D;
import java.nio.file.Path;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

import org.junit.jupiter.api.Test;

import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService;
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;

import lombok.SneakyThrows;

class InvisibleTableDetectionServiceTest {

@Test
// @Disabled
@SneakyThrows
public void detectInvisibleTableTest() {

String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName)
.stream()
.map(PageInformationService::build)
.collect(Collectors.toList());

int pageNumber = 1;
Rectangle2D tableBBox = pageContents.get(0).getPageContents().getSortedWords().subList(45, 152)
.stream()
.map(Word::getBBox)
.map(this::mirrorY)
.collect(RectangleTransformations.collectBBox());

List<Word> words = pageContents.get(0).getPageContents().getSortedWords()
.stream()
.filter(textPositionSequence -> tableBBox.contains(mirrorY(textPositionSequence.getBBox())))
.toList();

var table = InvisibleTableDetectionService.detectTable(words, tableBBox);

PdfDraw.drawRectanglesPerPage(fileName,
List.of(table.stream()
.flatMap(Collection::stream)
.toList(), Collections.emptyList()),
tmpFileName);
}

private Rectangle2D mirrorY(Rectangle2D rectangle2D) {

if (rectangle2D.getHeight() >= 0) {
return rectangle2D;
}
return new Rectangle2D.Double(rectangle2D.getX(), rectangle2D.getY() + rectangle2D.getHeight(), rectangle2D.getWidth(), -rectangle2D.getHeight());
}

}

@ -5,6 +5,7 @@ import java.util.List;

import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;

import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
@ -20,7 +21,7 @@ class MainBodyTextFrameExtractionServiceTest {

String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString();
List<PageContents> sortedTextPositionSequence = PageContentExtractor.getSortedPageContents(fileName);
List<PageContents> sortedTextPositionSequence = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4);

}

@ -4,6 +4,7 @@ import java.nio.file.Path;
import java.util.List;

import org.junit.jupiter.api.Test;
import org.springframework.core.io.ClassPathResource;

import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
@ -21,11 +22,11 @@ class PageContentExtractorTest {
String fileName = "files/syngenta/CustomerFiles/Documine/Flora/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();

List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
List<PageContents> textPositionPerPage = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4);

PdfDraw.drawRectanglesPerPageNumberedByLine(fileName,
textPositionPerPage.stream()
.map(t -> t.getSortedWords()
.map(t -> t.getWords()
.stream()
.map(Word::getBBoxPdf)
.map(List::of)

Some files were not shown because too many files have changed in this diff.