RED-8670: add table detection from idp result
* some 'slight' refactoring
This commit is contained in:
parent
b5152112ee
commit
3a700aecd4
@ -10,38 +10,23 @@ import lombok.NonNull;
|
||||
@Builder
|
||||
@Schema(description = "Object containing all storage paths the service needs to know.")
|
||||
public record LayoutParsingRequest(
|
||||
@Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}")//
|
||||
@NonNull LayoutParsingType layoutParsingType,
|
||||
@Schema(description = "Enum specifying the type of layout parsing to be performed.", allowableValues = "{RedactManager, DocuMine, TAAS}") @NonNull LayoutParsingType layoutParsingType,
|
||||
@Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.") Map<String, String> identifier,
|
||||
@Schema(description = "Path to the original PDF file.") @NonNull String originFileStorageId,
|
||||
|
||||
@Schema(description = "General purpose identifiers. They are not changed by the service at all and are returned as is in the response queue.")//
|
||||
Map<String, String> identifier,
|
||||
@Schema(description = "Optional Path to the table extraction file.") Optional<String> tablesFileStorageId,
|
||||
@Schema(description = "Optional Path to the image classification file.") Optional<String> imagesFileStorageId,
|
||||
@Schema(description = "Path where the IDP Result File is stored.") Optional<String> idpResultStorageId,
|
||||
@Schema(description = "Optional Path to the the visual layout parsing service file") Optional<String> visualLayoutParsingFileId,
|
||||
|
||||
@Schema(description = "Path to the original PDF file.")//
|
||||
@NonNull String originFileStorageId,//
|
||||
|
||||
@Schema(description = "Optional Path to the table extraction file.")//
|
||||
Optional<String> tablesFileStorageId,//
|
||||
@Schema(description = "Optional Path to the image classification file.")//
|
||||
Optional<String> imagesFileStorageId,//
|
||||
|
||||
@Schema(description = "Optional Path to the the visual layout parsing service file") Optional<String> visualLayoutParsingFileId,//
|
||||
|
||||
@Schema(description = "Path where the Document Structure File will be stored.")//
|
||||
@NonNull String structureFileStorageId,//
|
||||
@Schema(description = "Path where the Research Data File will be stored.")//
|
||||
String researchDocumentStorageId,//
|
||||
@Schema(description = "Path where the Document Text File will be stored.")//
|
||||
@NonNull String textBlockFileStorageId,//
|
||||
@Schema(description = "Path where the Document Positions File will be stored.")//
|
||||
@NonNull String positionBlockFileStorageId,//
|
||||
@Schema(description = "Path where the Document Pages File will be stored.")//
|
||||
@NonNull String pageFileStorageId,//
|
||||
@Schema(description = "Path where the Document Markdown File will be stored.")//
|
||||
Optional<String> documentMarkdownFileStorageId,//
|
||||
@Schema(description = "Path where the Simplified Text File will be stored.")//
|
||||
@NonNull String simplifiedTextStorageId,//
|
||||
@Schema(description = "Path where the Viewer Document PDF will be stored.")//
|
||||
@NonNull String viewerDocumentStorageId
|
||||
@Schema(description = "Path where the Document Structure File will be stored.") @NonNull String structureFileStorageId,
|
||||
@Schema(description = "Path where the Research Data File will be stored.") String researchDocumentStorageId,
|
||||
@Schema(description = "Path where the Document Text File will be stored.") @NonNull String textBlockFileStorageId,
|
||||
@Schema(description = "Path where the Document Positions File will be stored.") @NonNull String positionBlockFileStorageId,
|
||||
@Schema(description = "Path where the Document Pages File will be stored.") @NonNull String pageFileStorageId,
|
||||
@Schema(description = "Path where the Document Markdown File will be stored.") Optional<String> documentMarkdownFileStorageId,
|
||||
@Schema(description = "Path where the Simplified Text File will be stored.") @NonNull String simplifiedTextStorageId,
|
||||
@Schema(description = "Path where the Viewer Document PDF will be stored.") @NonNull String viewerDocumentStorageId
|
||||
) {
|
||||
|
||||
}
|
||||
|
||||
@ -23,6 +23,8 @@ dependencies {
|
||||
}
|
||||
implementation("com.iqser.red.commons:storage-commons:2.50.0")
|
||||
|
||||
api("com.knecon.fforesight:azure-ocr-service-api:0.23.0")
|
||||
|
||||
implementation("org.apache.pdfbox:pdfbox:${pdfBoxVersion}")
|
||||
implementation("org.apache.pdfbox:pdfbox-tools:${pdfBoxVersion}")
|
||||
implementation("com.fasterxml.jackson.module:jackson-module-afterburner:${jacksonVersion}")
|
||||
|
||||
@ -17,4 +17,6 @@ public class LayoutParserSettings {
|
||||
|
||||
boolean debug;
|
||||
LayoutParsingType layoutParsingTypeOverride;
|
||||
String pdftronLicense;
|
||||
int extractionThreads = 1;
|
||||
}
|
||||
|
||||
@ -14,39 +14,39 @@ import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.mapper.DocumentDataMapper;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.NodeType;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEnhancementService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.adapter.CvTableParsingAdapter;
|
||||
@ -56,24 +56,26 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.tables.TableExtractionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.ClassificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.MarkdownMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.Table;
|
||||
|
||||
import io.micrometer.observation.Observation;
|
||||
import io.micrometer.observation.ObservationRegistry;
|
||||
@ -98,10 +100,8 @@ public class LayoutParsingPipeline {
|
||||
final SimplifiedSectionTextService simplifiedSectionTextService;
|
||||
final RulingCleaningService rulingCleaningService;
|
||||
final TableExtractionService tableExtractionService;
|
||||
final DocuMineBlockificationService docuMineBlockificationService;
|
||||
final RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
final BlockificationService blockificationService;
|
||||
final BlockificationPostprocessingService blockificationPostprocessingService;
|
||||
final DocstrumBlockificationService docstrumBlockificationService;
|
||||
final LayoutGridService layoutGridService;
|
||||
final ObservationRegistry observationRegistry;
|
||||
final VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||
@ -111,11 +111,11 @@ public class LayoutParsingPipeline {
|
||||
final SectionTreeEnhancementService sectionTreeEnhancementService;
|
||||
final LayoutParserSettings settings;
|
||||
final ClassificationService classificationService;
|
||||
final ReadingOrderService readingOrderService;
|
||||
|
||||
@Value("${LAYOUT_PARSER_VERSION:}")
|
||||
private String layoutParserVersion;
|
||||
|
||||
|
||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
@ -134,14 +134,16 @@ public class LayoutParsingPipeline {
|
||||
TableServiceResponse tableServiceResponse = layoutParsingRequest.tablesFileStorageId()
|
||||
.map(layoutParsingStorageService::getTablesFile)
|
||||
.orElse(new TableServiceResponse());
|
||||
IdpResult idpResult = layoutParsingRequest.idpResultStorageId()
|
||||
.map(layoutParsingStorageService::getIdpResultFile).orElse(IdpResult.empty());
|
||||
|
||||
LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null //
|
||||
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();
|
||||
LayoutParsingType layoutParsingType = settings.getLayoutParsingTypeOverride() == null ? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride();
|
||||
|
||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingType,
|
||||
originFile,
|
||||
imageServiceResponse,
|
||||
tableServiceResponse,
|
||||
idpResult,
|
||||
visualLayoutParsingResponse,
|
||||
layoutParsingRequest.identifier());
|
||||
|
||||
@ -159,7 +161,8 @@ public class LayoutParsingPipeline {
|
||||
if (layoutParsingRequest.documentMarkdownFileStorageId()
|
||||
.isPresent()) {
|
||||
layoutParsingStorageService.storeMarkdownFile(layoutParsingRequest.documentMarkdownFileStorageId()
|
||||
.get(), new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
|
||||
.get(),
|
||||
new MarkdownMapper().toMarkdownContent(documentWithVisualization.document()));
|
||||
}
|
||||
layoutParsingStorageService.storeSimplifiedText(layoutParsingRequest, simplifiedSectionTextService.toSimplifiedText(documentWithVisualization.document()));
|
||||
layoutParsingStorageService.storeViewerDocument(layoutParsingRequest, viewerDocumentFile);
|
||||
@ -237,15 +240,22 @@ public class LayoutParsingPipeline {
|
||||
File originFile,
|
||||
ImageServiceResponse imageServiceResponse,
|
||||
TableServiceResponse tableServiceResponse,
|
||||
IdpResult idpResult,
|
||||
VisualLayoutParsingResponse visualLayoutParsingResponse,
|
||||
Map<String, String> identifier) {
|
||||
|
||||
PDDocument originDocument = openDocument(originFile);
|
||||
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
|
||||
PageContentExtractor extractor = new PageContentExtractor(originFile, settings.getExtractionThreads());
|
||||
extractor.startAsync();
|
||||
int pageCount = extractor.getPageCount();
|
||||
addNumberOfPagesToTrace(pageCount, Files.size(originFile.toPath()));
|
||||
|
||||
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse, idpResult);
|
||||
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
||||
Function<Table, Integer> pageNumberExtractor = table -> table.bboxes().get(0).pageNumber();
|
||||
Map<Integer, List<Table>> idpTablesPerPage = idpResult.tables()
|
||||
.stream()
|
||||
.collect(Collectors.groupingBy(pageNumberExtractor));
|
||||
|
||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||
|
||||
@ -255,32 +265,20 @@ public class LayoutParsingPipeline {
|
||||
|
||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||
|
||||
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
||||
|
||||
long pageCount = originDocument.getNumberOfPages();
|
||||
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originFile));
|
||||
|
||||
for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
|
||||
|
||||
if (pageNumber % 100 == 0) {
|
||||
// re-open document every once in a while to save on RAM. This has no significant performance impact.
|
||||
// This is due to PDFBox caching all images and some other stuff with Soft References. This dereferences them and forces the freeing of memory.
|
||||
originDocument.close();
|
||||
originDocument = openDocument(originFile);
|
||||
}
|
||||
|
||||
PageContents pageContents = extractor.awaitPageContents(pageNumber);
|
||||
if (pageNumber % 100 == 0 || pageNumber == pageCount || pageNumber == 1) {
|
||||
log.info("Extracting text on Page {} for {}", pageNumber, identifier);
|
||||
log.info("Processing text on Page {} for {}", pageNumber, identifier);
|
||||
}
|
||||
|
||||
classificationDocument.setPages(classificationPages);
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = originDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(originDocument);
|
||||
List<Word> words = stripper.getWords();
|
||||
|
||||
List<Word> words = pageContents.getWords();
|
||||
List<Ruling> rulings = pageContents.getRulings();
|
||||
PageInformation pageInformation = pageContents.getPageInformation();
|
||||
|
||||
// rotateDirAdjExactly(words, pdPage); // works really well for many highly rotated documents (e.g. VV-331340.pdf), but it decreases the headline performance by 1.3%, so I am leaving it out for now
|
||||
|
||||
@ -291,39 +289,23 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
classificationDocument.getLayoutDebugLayer().addTextVisualizations(words, pageNumber);
|
||||
|
||||
PDRectangle pdr = pdPage.getMediaBox();
|
||||
|
||||
List<Ruling> rulings = stripper.getRulings();
|
||||
classificationDocument.getLayoutDebugLayer().addRulingVisualization(rulings, pageNumber);
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), rulings);
|
||||
|
||||
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
|
||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
|
||||
classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber);
|
||||
|
||||
classificationDocument.getLayoutDebugLayer().addCellVisualizations(emptyTableCells, pageNumber, null);
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
|
||||
|
||||
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getWords(), false);
|
||||
List<TablePageBlock> tables = tableExtractionService.extractTables(emptyTableCells, words, pageInformation, idpTablesPerPage.get(pageNumber), layoutParsingType, classificationDocument.getLayoutDebugLayer());
|
||||
|
||||
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
||||
.addAll(graphics.stream()
|
||||
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()),
|
||||
ImageType.GRAPHIC,
|
||||
false,
|
||||
stripper.getPageNumber(),
|
||||
""))
|
||||
.toList());
|
||||
List<ClassifiedImage> graphics = graphicExtractorService.extractPathElementGraphics(pageContents.getGraphicBBoxes(), pageNumber, cleanRulings);
|
||||
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>()).addAll(graphics);
|
||||
|
||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getWords(), cleanRulings, classificationDocument.getLayoutDebugLayer());
|
||||
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
||||
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getLayoutDebugLayer(), layoutParsingType);
|
||||
};
|
||||
List<TextPageBlock> textBlocks = blockificationService.blockify(layoutParsingType, words, cleanRulings, classificationDocument.getLayoutDebugLayer());
|
||||
|
||||
updateClassificationPage(pdPage, pdr, classificationPage, cleanRulings, pageNumber, pageInformation);
|
||||
List<AbstractPageBlock> blocks = readingOrderService.resolve(textBlocks, tables);
|
||||
|
||||
ClassificationPage classificationPage = new ClassificationPage(blocks, pageInformation, cleanRulings);
|
||||
|
||||
blockificationPostprocessingService.findHeadlinesFromOutline(classificationDocument, pageNumber, classificationPage, pageInformation);
|
||||
|
||||
@ -345,16 +327,12 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
}
|
||||
|
||||
tableExtractionService.extractTables(emptyTableCells, classificationPage);
|
||||
|
||||
buildPageStatistics(classificationPage);
|
||||
increaseDocumentStatistics(classificationPage, classificationDocument);
|
||||
|
||||
classificationPages.add(classificationPage);
|
||||
}
|
||||
|
||||
originDocument.close();
|
||||
|
||||
classificationService.classify(classificationDocument, layoutParsingType, identifier);
|
||||
|
||||
SectionTree sectionTree = sectionTreeBuilderService.createSectionTree(classificationDocument);
|
||||
@ -371,24 +349,6 @@ public class LayoutParsingPipeline {
|
||||
}
|
||||
|
||||
|
||||
private static void updateClassificationPage(PDPage pdPage,
|
||||
PDRectangle pdr,
|
||||
ClassificationPage classificationPage,
|
||||
CleanRulings cleanRulings,
|
||||
int pageNumber,
|
||||
PageInformation pageInformation) {
|
||||
|
||||
int rotation = pdPage.getRotation();
|
||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||
classificationPage.setCleanRulings(cleanRulings);
|
||||
classificationPage.setRotation(rotation);
|
||||
classificationPage.setLandscape(isLandscape);
|
||||
classificationPage.setPageNumber(pageNumber);
|
||||
classificationPage.setPageWidth((float) pageInformation.width());
|
||||
classificationPage.setPageHeight((float) pageInformation.height());
|
||||
}
|
||||
|
||||
|
||||
private static void rotateDirAdjExactly(List<Word> words, PDPage pdPage) {
|
||||
|
||||
for (TextDirection dir : TextDirection.values()) {
|
||||
|
||||
@ -25,6 +25,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.ViewerDocVersioningUtility;
|
||||
import com.knecon.fforesight.tenantcommons.TenantContext;
|
||||
|
||||
@ -95,7 +96,23 @@ public class LayoutParsingStorageService {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
|
||||
@SneakyThrows
|
||||
public IdpResult getIdpResultFile(String storageId) {
|
||||
|
||||
if (!storageService.objectExists(TenantContext.getTenantId(), storageId)) {
|
||||
return IdpResult.empty();
|
||||
}
|
||||
try (var idpResultStream = getObject(storageId)) {
|
||||
|
||||
IdpResult idpResult = objectMapper.readValue(idpResultStream, IdpResult.class);
|
||||
idpResultStream.close();
|
||||
return idpResult;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public VisualLayoutParsingResponse getVisualLayoutParsingFile(String storageId) {
|
||||
|
||||
try (InputStream inputStream = getObject(storageId)) {
|
||||
|
||||
@ -1,9 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.EnumMap;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
@ -16,10 +14,8 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.Rea
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@ -27,7 +23,6 @@ import lombok.RequiredArgsConstructor;
|
||||
@RequiredArgsConstructor
|
||||
public class DocstrumSegmentationService {
|
||||
|
||||
public static final double SAME_DIRECTION_THRESHOLD = 0.9;
|
||||
private final NearestNeighbourService nearestNeighbourService;
|
||||
private final SpacingService spacingService;
|
||||
private final LineBuilderService lineBuilderService;
|
||||
@ -35,52 +30,27 @@ public class DocstrumSegmentationService {
|
||||
private final ReadingOrderService readingOrderService;
|
||||
|
||||
|
||||
public List<Zone> segmentPage(List<Word> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutDebugLayer visualizations) {
|
||||
public List<Zone> segmentPage(List<Word> words, boolean xyOrder, CleanRulings usedRulings) {
|
||||
|
||||
EnumMap<TextDirection, Integer> directionCounts = new EnumMap<>(TextDirection.class);
|
||||
|
||||
List<Zone> newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO);
|
||||
directionCounts.put(TextDirection.ZERO, newZones.size());
|
||||
List<Zone> newZones = computeZones(words, usedRulings, TextDirection.ZERO);
|
||||
List<Zone> zones = new ArrayList<>(newZones);
|
||||
|
||||
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE);
|
||||
directionCounts.put(TextDirection.QUARTER_CIRCLE, newZones.size());
|
||||
newZones = computeZones(words, usedRulings, TextDirection.QUARTER_CIRCLE);
|
||||
zones.addAll(newZones);
|
||||
|
||||
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE);
|
||||
directionCounts.put(TextDirection.HALF_CIRCLE, newZones.size());
|
||||
newZones = computeZones(words, usedRulings, TextDirection.HALF_CIRCLE);
|
||||
zones.addAll(newZones);
|
||||
|
||||
newZones = computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE);
|
||||
directionCounts.put(TextDirection.THREE_QUARTER_CIRCLE, newZones.size());
|
||||
newZones = computeZones(words, usedRulings, TextDirection.THREE_QUARTER_CIRCLE);
|
||||
zones.addAll(newZones);
|
||||
|
||||
return readingOrderService.resolve(zones, xyOrder, mostSameDirection(directionCounts));
|
||||
return readingOrderService.resolve(zones, xyOrder);
|
||||
}
|
||||
|
||||
|
||||
private boolean mostSameDirection(EnumMap<TextDirection, Integer> directionCounts) {
|
||||
private List<Zone> computeZones(List<Word> words, CleanRulings rulings, TextDirection direction) {
|
||||
|
||||
int total = directionCounts.values()
|
||||
.stream()
|
||||
.mapToInt(i -> i).sum();
|
||||
|
||||
if ((double) directionCounts.get(TextDirection.ZERO) / total > SAME_DIRECTION_THRESHOLD) {
|
||||
return true;
|
||||
} else if ((double) directionCounts.get(TextDirection.QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
|
||||
return true;
|
||||
} else if ((double) directionCounts.get(TextDirection.HALF_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
|
||||
return true;
|
||||
} else if ((double) directionCounts.get(TextDirection.THREE_QUARTER_CIRCLE) / total > SAME_DIRECTION_THRESHOLD) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> computeZones(List<Word> textPositions, CleanRulings rulings, LayoutDebugLayer visualizations, TextDirection direction) {
|
||||
|
||||
List<Character> characters = textPositions.stream()
|
||||
List<Character> characters = words.stream()
|
||||
.filter(t -> t.getDir() == direction)
|
||||
.map(Word::getCharacters)
|
||||
.flatMap(List::stream)
|
||||
|
||||
@ -1,7 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
@ -25,8 +24,6 @@ public abstract class BoundingBox {
|
||||
// Also, these are definitely correct and should be used whenever possible.
|
||||
protected Rectangle2D bBoxPdf;
|
||||
|
||||
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
||||
|
||||
|
||||
public double getX() {
|
||||
|
||||
@ -204,23 +201,22 @@ public abstract class BoundingBox {
|
||||
}
|
||||
|
||||
|
||||
public double verticalOverlap(BoundingBox other) {
|
||||
public double verticalOverlapPdf(BoundingBox other) {
|
||||
|
||||
return Math.max(0, Math.min(this.getPdfMaxY(), other.getPdfMaxY()) - Math.max(this.getPdfMinY(), other.getPdfMinY()));
|
||||
}
|
||||
|
||||
|
||||
public static final Comparator<BoundingBox> ILL_DEFINED_ORDER = (o1, o2) -> {
|
||||
public double verticalOverlap(BoundingBox other) {
|
||||
|
||||
if (o1.equals(o2)) {
|
||||
return 0;
|
||||
}
|
||||
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD * ((o1.getHeight() + o2.getHeight()) / 2)) {
|
||||
return Double.compare(o1.getPdfMinX(), o2.getPdfMinX());
|
||||
} else {
|
||||
return Double.compare(o1.getPdfMaxY(), o2.getPdfMaxY());
|
||||
}
|
||||
};
|
||||
return Math.max(0, Math.min(this.getMaxY(), other.getMaxY()) - Math.max(this.getMinY(), other.getMinY()));
|
||||
}
|
||||
|
||||
|
||||
public double horizontalOverlap(BoundingBox other) {
|
||||
|
||||
return Math.max(0, Math.min(this.getMaxX(), other.getMaxX()) - Math.max(this.getMinX(), other.getMinX()));
|
||||
}
|
||||
|
||||
|
||||
public double horizontalDistance(BoundingBox other) {
|
||||
@ -276,4 +272,13 @@ public abstract class BoundingBox {
|
||||
return this.intersectsX(other) && this.getMinY() >= other.getMaxY();
|
||||
}
|
||||
|
||||
|
||||
public double intersectedArea(BoundingBox r2) {
|
||||
|
||||
double xOverlap = horizontalOverlap(r2);
|
||||
double yOverlap = verticalOverlap(r2);
|
||||
|
||||
return xOverlap * yOverlap;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -2,7 +2,9 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
@ -36,19 +38,16 @@ public abstract class TextBoundingBox extends BoundingBox {
|
||||
.map(TextBoundingBox::getBBoxDirAdj)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
Set<TextDirection> textDirections = components.stream()
|
||||
Optional<TextDirection> mostCommonDir = components.stream()
|
||||
.filter(c -> c instanceof TextBoundingBox)
|
||||
.map(c -> (TextBoundingBox) c)
|
||||
.map(TextBoundingBox::getDir)
|
||||
.collect(Collectors.toSet());
|
||||
.collect(Collectors.groupingBy(Function.identity(), Collectors.counting())).entrySet()
|
||||
.stream()
|
||||
.max(Map.Entry.comparingByValue())
|
||||
.map(Map.Entry::getKey);
|
||||
|
||||
if (textDirections.isEmpty()) {
|
||||
dir = TextDirection.ZERO;
|
||||
} else if (textDirections.size() > 1) {
|
||||
throw new IllegalArgumentException("More than one text direction found");
|
||||
} else {
|
||||
dir = textDirections.iterator().next();
|
||||
}
|
||||
dir = mostCommonDir.orElse(TextDirection.ZERO);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -9,7 +9,7 @@ import org.springframework.stereotype.Service;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
|
||||
@Service
|
||||
|
||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.EnumMap;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
@ -12,25 +13,43 @@ import java.util.stream.Collectors;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
@Service
|
||||
public class ReadingOrderService {
|
||||
|
||||
private static final double THRESHOLD = 5;
|
||||
public static final double MULTI_COLUMN_DETECTION_THRESHOLD = 1.5;
|
||||
public static final double SAME_DIRECTION_THRESHOLD = 0.9;
|
||||
|
||||
private static final Comparator<TextBoundingBox> COMPARATOR = //
|
||||
Comparator.comparing(TextBoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(TextBoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
|
||||
Comparator.comparing(TextBoundingBox::getY,
|
||||
(o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(TextBoundingBox::getX,
|
||||
(o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
|
||||
|
||||
private static final Comparator<TextBoundingBox> COMPARATOR_DIR_ADJ = //
|
||||
Comparator.comparing(TextBoundingBox::getYDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
Comparator.comparing(TextBoundingBox::getYDirAdj,
|
||||
(o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||
.thenComparing(TextBoundingBox::getXDirAdj, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD));
|
||||
|
||||
|
||||
public List<Zone> resolve(List<Zone> zones, boolean xyReadingOrder, boolean useDirAdjCoords) {
|
||||
public List<AbstractPageBlock> resolve(List<TextPageBlock> textBlocks, List<TablePageBlock> tables) {
|
||||
|
||||
List<AbstractPageBlock> unsortedBlocks = new ArrayList<>(textBlocks.size() + tables.size());
|
||||
unsortedBlocks.addAll(textBlocks);
|
||||
unsortedBlocks.addAll(tables);
|
||||
return resolve(unsortedBlocks, false);
|
||||
}
|
||||
|
||||
|
||||
public <T extends TextBoundingBox> List<T> resolve(List<T> zones, boolean xyReadingOrder) {
|
||||
|
||||
boolean useDirAdjCoords = mostSameDirection(zones);
|
||||
|
||||
if (zones.isEmpty() || zones.size() == 1) {
|
||||
return zones;
|
||||
@ -41,7 +60,7 @@ public class ReadingOrderService {
|
||||
}
|
||||
|
||||
Map<Long, Integer> histogram = new HashMap<>();
|
||||
for (Zone zone : zones) {
|
||||
for (TextBoundingBox zone : zones) {
|
||||
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
|
||||
long minY = Math.round(bbox.getMinY());
|
||||
long maxY = Math.round(bbox.getMaxY());
|
||||
@ -52,8 +71,7 @@ public class ReadingOrderService {
|
||||
|
||||
if (histogram.values()
|
||||
.stream()
|
||||
.mapToInt(Integer::intValue).average()
|
||||
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
||||
.mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
||||
return resolveSingleColumnReadingOrder(zones, useDirAdjCoords);
|
||||
} else {
|
||||
|
||||
@ -63,7 +81,7 @@ public class ReadingOrderService {
|
||||
}
|
||||
|
||||
|
||||
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones, boolean useDirAdjCoords) {
|
||||
private static <T extends TextBoundingBox> List<T> resolveSingleColumnReadingOrder(List<T> zones, boolean useDirAdjCoords) {
|
||||
|
||||
if (useDirAdjCoords) {
|
||||
return zones.stream()
|
||||
@ -71,7 +89,7 @@ public class ReadingOrderService {
|
||||
.stream()
|
||||
.flatMap(words -> words.stream()
|
||||
.sorted(COMPARATOR_DIR_ADJ))
|
||||
.toList();
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
zones.sort(COMPARATOR);
|
||||
@ -79,7 +97,7 @@ public class ReadingOrderService {
|
||||
}
|
||||
|
||||
|
||||
private List<Zone> resolveMultiColumnReadingOder(List<Zone> zones, boolean useDirAdjCoords) {
|
||||
private <T extends TextBoundingBox> List<T> resolveMultiColumnReadingOder(List<T> zones, boolean useDirAdjCoords) {
|
||||
|
||||
// Simple reading order resolver for multi column page layout as described here : https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
|
||||
// TODO implement a more fancy reading order resolver see https://github.com/BobLd/DocumentLayoutAnalysis/blob/master/README.md#reading-order
|
||||
@ -87,7 +105,7 @@ public class ReadingOrderService {
|
||||
double minX = Double.POSITIVE_INFINITY;
|
||||
double maxX = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (Zone zone : zones) {
|
||||
for (T zone : zones) {
|
||||
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
|
||||
if (bbox.getX() < minX) {
|
||||
minX = zone.getXDirAdj();
|
||||
@ -99,11 +117,11 @@ public class ReadingOrderService {
|
||||
|
||||
double midLineXCoordinate = (minX + maxX) / 2;
|
||||
|
||||
List<Zone> leftOf = new ArrayList<>();
|
||||
List<Zone> rightOf = new ArrayList<>();
|
||||
List<Zone> middle = new ArrayList<>();
|
||||
List<T> leftOf = new ArrayList<>();
|
||||
List<T> rightOf = new ArrayList<>();
|
||||
List<T> middle = new ArrayList<>();
|
||||
|
||||
for (Zone zone : zones) {
|
||||
for (T zone : zones) {
|
||||
Rectangle2D bbox = useDirAdjCoords ? zone.getBBoxDirAdj() : zone.getBBox();
|
||||
if (bbox.getX() < midLineXCoordinate && bbox.getX() + bbox.getWidth() < midLineXCoordinate) {
|
||||
leftOf.add(zone);
|
||||
@ -166,14 +184,14 @@ public class ReadingOrderService {
|
||||
middle.addAll(leftNotIntersecting);
|
||||
middle.addAll(rightNotIntersecting);
|
||||
*/
|
||||
List<Zone> sortedZones = new ArrayList<>();
|
||||
List<T> sortedZones = new ArrayList<>();
|
||||
sortedZones.addAll(leftOf);
|
||||
sortedZones.addAll(rightOf);
|
||||
|
||||
ListIterator<Zone> itty = middle.listIterator();
|
||||
ListIterator<T> itty = middle.listIterator();
|
||||
|
||||
while (itty.hasNext()) {
|
||||
Zone current = itty.next();
|
||||
T current = itty.next();
|
||||
Rectangle2D bbox = useDirAdjCoords ? current.getBBoxDirAdj() : current.getBBox();
|
||||
for (int i = 0; i < sortedZones.size(); i++) {
|
||||
if (bbox.getY() < sortedZones.get(i).getY()) {
|
||||
@ -189,4 +207,29 @@ public class ReadingOrderService {
|
||||
return sortedZones;
|
||||
}
|
||||
|
||||
|
||||
private boolean mostSameDirection(List<? extends TextBoundingBox> zones) {
|
||||
|
||||
EnumMap<TextDirection, Integer> directionCounts = new EnumMap<>(TextDirection.class);
|
||||
|
||||
for (TextBoundingBox zone : zones) {
|
||||
TextDirection dir = zone.getDir();
|
||||
directionCounts.put(dir, directionCounts.getOrDefault(dir, 0) + 1);
|
||||
}
|
||||
int total = directionCounts.values()
|
||||
.stream()
|
||||
.mapToInt(i -> i).sum();
|
||||
|
||||
if ((double) directionCounts.getOrDefault(TextDirection.ZERO, 0) / total > SAME_DIRECTION_THRESHOLD) {
|
||||
return true;
|
||||
} else if ((double) directionCounts.getOrDefault(TextDirection.QUARTER_CIRCLE, 0) / total > SAME_DIRECTION_THRESHOLD) {
|
||||
return true;
|
||||
} else if ((double) directionCounts.getOrDefault(TextDirection.HALF_CIRCLE, 0) / total > SAME_DIRECTION_THRESHOLD) {
|
||||
return true;
|
||||
} else if ((double) directionCounts.getOrDefault(TextDirection.THREE_QUARTER_CIRCLE, 0) / total > SAME_DIRECTION_THRESHOLD) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -10,7 +10,7 @@ import java.util.stream.Collectors;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.experimental;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
@ -7,7 +7,6 @@ import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
@ -23,13 +22,13 @@ public class DividingColumnDetectionService {
|
||||
public List<Rectangle2D> detectColumns(PageContents pageContents) {
|
||||
|
||||
|
||||
if (pageContents.getSortedWords().size() < 2) {
|
||||
return List.of(pageContents.getCropBox());
|
||||
if (pageContents.getWords().size() < 2) {
|
||||
return List.of(pageContents.getPageInformation().cropBox());
|
||||
}
|
||||
|
||||
GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedWords(), pageContents.getCropBox());
|
||||
GapInformation linesWithGapInformation = GapDetectionService.findGapsInLines(pageContents.getWords(), pageContents.getPageInformation().cropBox());
|
||||
|
||||
return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getCropBox());
|
||||
return detectColumnsFromLines(linesWithGapInformation.getXGaps(), pageContents.getPageInformation().cropBox());
|
||||
}
|
||||
|
||||
|
||||
@ -1,10 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.experimental;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.experimental;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.LinkedList;
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.experimental;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
@ -6,9 +6,6 @@ import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Queue;
|
||||
import java.util.stream.Stream;
|
||||
import com.iqser.red.commons.jackson.ObjectMapperFactory;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
@ -1,12 +1,10 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.experimental;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
@ -1,11 +1,9 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.experimental;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.experimental;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
@ -1,12 +1,15 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -17,18 +20,18 @@ import lombok.NoArgsConstructor;
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
public abstract class AbstractPageBlock extends BoundingBox {
|
||||
public abstract class AbstractPageBlock extends TextBoundingBox {
|
||||
|
||||
@JsonIgnore
|
||||
protected PageBlockType classification;
|
||||
|
||||
Set<LayoutEngine> engines = new HashSet<>();
|
||||
protected Set<LayoutEngine> engines = new HashSet<>();
|
||||
|
||||
@JsonIgnore
|
||||
protected int page;
|
||||
|
||||
@JsonIgnore
|
||||
private Orientation orientation = Orientation.NONE;
|
||||
protected Orientation orientation = Orientation.NONE;
|
||||
|
||||
|
||||
public abstract String getText();
|
||||
@ -42,4 +45,6 @@ public abstract class AbstractPageBlock extends BoundingBox {
|
||||
|
||||
public abstract boolean isEmpty();
|
||||
|
||||
public abstract List<Word> getWords();
|
||||
|
||||
}
|
||||
|
||||
@ -5,6 +5,8 @@ import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTree;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
@ -11,29 +12,38 @@ import com.knecon.fforesight.service.layoutparser.processor.model.image.Classifi
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.NonNull;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@Data
|
||||
@RequiredArgsConstructor
|
||||
|
||||
public class ClassificationPage {
|
||||
|
||||
public ClassificationPage(List<AbstractPageBlock> pageBlocks, PageInformation pageInformation, CleanRulings cleanRulings) {
|
||||
|
||||
this.cleanRulings = cleanRulings;
|
||||
this.pageNumber = pageInformation.number();
|
||||
this.textBlocks = pageBlocks;
|
||||
var mediaBox = pageInformation.mediabox();
|
||||
int rotation = pageInformation.rotationDegrees();
|
||||
this.landscape = mediaBox.getWidth() > mediaBox.getHeight() && (rotation == 0 || rotation == 180) //
|
||||
|| mediaBox.getHeight() > mediaBox.getWidth() && (rotation == 90 || rotation == 270);
|
||||
this.pageInformation = pageInformation;
|
||||
}
|
||||
|
||||
|
||||
private PageInformation pageInformation;
|
||||
@NonNull
|
||||
private List<AbstractPageBlock> textBlocks;
|
||||
|
||||
private List<OutlineObject> outlineObjects = new ArrayList<>();
|
||||
|
||||
private List<AbstractPageBlock> headlines = new ArrayList<>();
|
||||
|
||||
private List<ClassifiedImage> images = new ArrayList<>();
|
||||
|
||||
private Rectangle bodyTextFrame;
|
||||
|
||||
private boolean landscape;
|
||||
private int rotation;
|
||||
|
||||
private int pageNumber;
|
||||
|
||||
@ -42,11 +52,32 @@ public class ClassificationPage {
|
||||
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
|
||||
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
|
||||
|
||||
private float pageWidth;
|
||||
private float pageHeight;
|
||||
|
||||
private CleanRulings cleanRulings;
|
||||
|
||||
private Map<String, List<Rectangle2D>> markedContentBboxPerType = new HashMap<>();
|
||||
|
||||
|
||||
public AffineTransform getPdfToPageTransform() {
|
||||
|
||||
return CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(getPageInformation());
|
||||
}
|
||||
|
||||
|
||||
public int getRotation() {
|
||||
|
||||
return pageInformation.rotationDegrees();
|
||||
}
|
||||
|
||||
|
||||
public float getPageWidth() {
|
||||
|
||||
return (float) pageInformation.width();
|
||||
}
|
||||
|
||||
|
||||
public float getPageHeight() {
|
||||
|
||||
return (float) pageInformation.height();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,10 +1,10 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -15,8 +15,9 @@ import lombok.Getter;
|
||||
@AllArgsConstructor
|
||||
public class PageContents {
|
||||
|
||||
List<Word> sortedWords;
|
||||
Rectangle2D cropBox;
|
||||
Rectangle2D mediaBox;
|
||||
PageInformation pageInformation;
|
||||
List<Word> words;
|
||||
List<Ruling> rulings;
|
||||
List<Box> graphicBBoxes;
|
||||
|
||||
}
|
||||
|
||||
@ -2,16 +2,63 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
public class PageInformation {
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
|
||||
|
||||
PageContents pageContents;
|
||||
LineInformation lineInformation;
|
||||
Rectangle2D mainBodyTextFrame;
|
||||
GapInformation gapInformation;
|
||||
public record PageInformation(Rectangle2D mediabox, Rectangle2D cropBox, int number, int rotationDegrees) {
|
||||
|
||||
}
|
||||
public static PageInformation fromPDPage(int pageNum, PDPage page) {
|
||||
|
||||
PDRectangle mediaBox = page.getMediaBox();
|
||||
PDRectangle cropBox = page.getCropBox();
|
||||
return new PageInformation(new Rectangle2D.Double(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight()),
|
||||
new Rectangle2D.Double(cropBox.getLowerLeftX(), cropBox.getLowerLeftY(), cropBox.getWidth(), cropBox.getHeight()),
|
||||
pageNum,
|
||||
page.getRotation());
|
||||
}
|
||||
|
||||
|
||||
public static PageInformation fromPage(Page page) {
|
||||
|
||||
return new PageInformation(new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()),
|
||||
new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()),
|
||||
page.getNumber(),
|
||||
page.getRotation());
|
||||
}
|
||||
|
||||
|
||||
public double height() {
|
||||
|
||||
return mediabox.getHeight();
|
||||
}
|
||||
|
||||
|
||||
public double heightRot() {
|
||||
|
||||
if (rotationDegrees == 90 || rotationDegrees == 270) {
|
||||
return width();
|
||||
}
|
||||
return height();
|
||||
}
|
||||
|
||||
|
||||
public double width() {
|
||||
|
||||
return mediabox.getWidth();
|
||||
}
|
||||
|
||||
|
||||
public double minX() {
|
||||
|
||||
return mediabox.getX();
|
||||
}
|
||||
|
||||
|
||||
public double minY() {
|
||||
|
||||
return mediabox.getY();
|
||||
}
|
||||
|
||||
}
|
||||
@ -4,6 +4,7 @@ import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Objects;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@ -77,7 +78,7 @@ public class SectionIdentifier {
|
||||
List<Integer> identifiers = new LinkedList<>();
|
||||
for (int i = 1; i <= 4; i++) {
|
||||
String numericalIdentifier = numericalIdentifierMatcher.group(i);
|
||||
if (numericalIdentifier == null || numericalIdentifier.equals("0") || numericalIdentifier.isEmpty() || numericalIdentifier.isBlank()) {
|
||||
if (numericalIdentifier == null || Objects.equals(numericalIdentifier, "0") || numericalIdentifier.isBlank()) {
|
||||
break;
|
||||
}
|
||||
identifiers.add(Integer.parseInt(numericalIdentifier.trim()));
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedHashMap;
|
||||
@ -2,12 +2,14 @@ package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.cos.COSArray;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSDictionary;
|
||||
@ -28,7 +30,7 @@ import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlin
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -48,19 +50,22 @@ public class OutlineExtractorService {
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public OutlineObjectTree getOutlineObjectTree(PDDocument document) {
|
||||
public OutlineObjectTree getOutlineObjectTree(File documentFile) {
|
||||
|
||||
PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();
|
||||
try (var document = Loader.loadPDF(documentFile)) {
|
||||
|
||||
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
|
||||
if (documentOutline != null) {
|
||||
for (PDOutlineItem child : documentOutline.children()) {
|
||||
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1);
|
||||
outlineObjectWithChildren.ifPresent(rootNodes::add);
|
||||
PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();
|
||||
|
||||
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
|
||||
if (documentOutline != null) {
|
||||
for (PDOutlineItem child : documentOutline.children()) {
|
||||
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1);
|
||||
outlineObjectWithChildren.ifPresent(rootNodes::add);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return new OutlineObjectTree(rootNodes);
|
||||
return new OutlineObjectTree(rootNodes);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -128,9 +133,7 @@ public class OutlineExtractorService {
|
||||
log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title));
|
||||
}
|
||||
|
||||
return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title,
|
||||
pageNumber,
|
||||
transformPointToPageCoords(outlinePosition, userSpaceToPageCoords), depth)));
|
||||
return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, transformPointToPageCoords(outlinePosition, userSpaceToPageCoords), depth)));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -10,8 +10,8 @@ import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
|
||||
@ -4,13 +4,15 @@ import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.TableCell;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
@ -22,7 +24,7 @@ import lombok.NoArgsConstructor;
|
||||
@NoArgsConstructor
|
||||
public class Cell extends BoundingBox {
|
||||
|
||||
private List<TextPageBlock> textBlocks = new ArrayList<>();
|
||||
private List<AbstractPageBlock> textBlocks = new ArrayList<>();
|
||||
|
||||
private List<Cell> headerCells = new ArrayList<>();
|
||||
|
||||
@ -33,17 +35,41 @@ public class Cell extends BoundingBox {
|
||||
private int pageNumber;
|
||||
|
||||
|
||||
public Cell(Point2D topLeft, Point2D bottomRight) {
|
||||
public Cell(Point2D topLeft, Point2D bottomRight, AffineTransform pdfToPageTransform) {
|
||||
|
||||
this.bBoxPdf = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
|
||||
this.bBox = bBoxPdf;
|
||||
this.bBox = RectangleTransformations.transform(bBoxPdf, pdfToPageTransform);
|
||||
}
|
||||
|
||||
|
||||
public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) {
|
||||
public static Cell fromPageCoordinates(Point2D topLeft, Point2D bottomRight, AffineTransform pageToPdfTransform) {
|
||||
|
||||
var bBox = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
|
||||
return fromPageCoordinates(bBox, pageToPdfTransform);
|
||||
}
|
||||
|
||||
|
||||
public static Cell fromPageCoordinates(Rectangle2D r, AffineTransform pageToPdfTransform) {
|
||||
|
||||
Cell cell = new Cell();
|
||||
var bBoxPdf = RectangleTransformations.transform(r, pageToPdfTransform);
|
||||
cell.bBox = r;
|
||||
cell.bBoxPdf = bBoxPdf;
|
||||
return cell;
|
||||
}
|
||||
|
||||
|
||||
public Cell(TableCell tableCell, AffineTransform pdfToPageTransform) {
|
||||
|
||||
this.bBoxPdf = tableCell.textRegion().region().bbox().get().getBounds2D();
|
||||
this.bBox = RectangleTransformations.transform(bBoxPdf, pdfToPageTransform);
|
||||
}
|
||||
|
||||
|
||||
public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform pdfToPageTransform) {
|
||||
|
||||
this.bBoxPdf = bBoxInitialUserSpace;
|
||||
this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D();
|
||||
this.bBox = RectangleTransformations.transform(bBoxPdf, pdfToPageTransform);
|
||||
}
|
||||
|
||||
|
||||
@ -56,9 +82,12 @@ public class Cell extends BoundingBox {
|
||||
}
|
||||
|
||||
|
||||
public void addTextBlock(TextPageBlock textBlock) {
|
||||
public List<Word> getWords() {
|
||||
|
||||
textBlocks.add(textBlock);
|
||||
return getTextBlocks().stream()
|
||||
.map(AbstractPageBlock::getWords)
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -67,24 +96,12 @@ public class Cell extends BoundingBox {
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
Iterator<TextPageBlock> itty = textBlocks.iterator();
|
||||
Word previous = null;
|
||||
while (itty.hasNext()) {
|
||||
|
||||
TextPageBlock textBlock = itty.next();
|
||||
|
||||
for (Word word : textBlock.getWords()) {
|
||||
if (previous != null) {
|
||||
if (Math.abs(previous.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight()) {
|
||||
sb.append('\n');
|
||||
} else {
|
||||
sb.append(' ');
|
||||
}
|
||||
}
|
||||
sb.append(word.toString());
|
||||
previous = word;
|
||||
for (int i = 0; i < textBlocks.size(); i++) {
|
||||
AbstractPageBlock textBlock = textBlocks.get(i);
|
||||
sb.append(textBlock);
|
||||
if (i < textBlocks.size() - 1) {
|
||||
sb.append("\n");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return TextNormalizationUtilities.cleanString(sb.toString());
|
||||
|
||||
@ -22,6 +22,12 @@ public class CleanRulings {
|
||||
List<Ruling> verticals; // unmodifiable sorted by X list
|
||||
|
||||
|
||||
public static CleanRulings empty() {
|
||||
|
||||
return new CleanRulings(Collections.emptyList(), Collections.emptyList());
|
||||
}
|
||||
|
||||
|
||||
public CleanRulings(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||
|
||||
this.horizontals = horizontals.stream()
|
||||
|
||||
@ -30,15 +30,24 @@ public class Ruling extends Line2D.Float {
|
||||
OTHER
|
||||
}
|
||||
|
||||
public enum Style {
|
||||
SOLID,
|
||||
DASHED
|
||||
}
|
||||
|
||||
@Getter
|
||||
@Setter
|
||||
private Classification classification;
|
||||
@Getter
|
||||
@Setter
|
||||
private Style style;
|
||||
|
||||
|
||||
public Ruling(Point2D p1, Point2D p2) {
|
||||
|
||||
super(p1, p2);
|
||||
this.classification = Classification.OTHER;
|
||||
this.style = Style.SOLID;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -1,48 +1,48 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Getter
|
||||
public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
public static final double CELL_AREA_CONTAINED_THRESHOLD = 0.98;
|
||||
private final TreeMap<CellPosition, Cell> cellTreeMap = new TreeMap<>();
|
||||
private final TextPageBlock caption;
|
||||
|
||||
private final int rotation;
|
||||
@Getter
|
||||
@Setter
|
||||
private String headline;
|
||||
private int unrotatedRowCount;
|
||||
private int unrotatedColCount;
|
||||
private List<List<Cell>> rows;
|
||||
@Getter
|
||||
@Setter
|
||||
private List<Cell> cells;
|
||||
private final List<List<Cell>> rows;
|
||||
|
||||
|
||||
public TablePageBlock(List<Cell> cells, int rotation) {
|
||||
public TablePageBlock(TextPageBlock caption, List<List<Cell>> rows) {
|
||||
|
||||
setToBBoxOfComponents(cells);
|
||||
this.cells = cells;
|
||||
addCells(cells);
|
||||
classification = PageBlockType.TABLE;
|
||||
this.rotation = rotation;
|
||||
this.classification = PageBlockType.TABLE;
|
||||
this.caption = caption;
|
||||
this.rows = rows;
|
||||
setBBoxes();
|
||||
}
|
||||
|
||||
|
||||
private void setBBoxes() {
|
||||
|
||||
List<BoundingBox> components = Stream.of(getCells().stream(),
|
||||
getCells().stream()
|
||||
.map(Cell::getTextBlocks)
|
||||
.flatMap(Collection::stream))
|
||||
.flatMap(Function.identity())
|
||||
.map(o -> (BoundingBox) o)
|
||||
.toList();
|
||||
setToBBoxOfComponents(components);
|
||||
}
|
||||
|
||||
|
||||
@ -53,28 +53,19 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
public List<List<Cell>> getRows() {
|
||||
|
||||
if (rows == null) {
|
||||
rows = computeRows();
|
||||
|
||||
// Ignore rows that does not contain any cells and values.
|
||||
List<List<Cell>> rowsToRemove = new ArrayList<>();
|
||||
for (List<Cell> row : rows) {
|
||||
if (row.size() == 1 && row.get(0).getTextBlocks().isEmpty()) {
|
||||
rowsToRemove.add(row);
|
||||
}
|
||||
}
|
||||
rows.removeAll(rowsToRemove);
|
||||
|
||||
computeHeaders();
|
||||
}
|
||||
|
||||
return rows;
|
||||
@Override
|
||||
public List<Word> getWords() {
|
||||
|
||||
return getCells().stream()
|
||||
.map(Cell::getTextBlocks)
|
||||
.flatMap(Collection::stream)
|
||||
.map(AbstractPageBlock::getWords)
|
||||
.flatMap(Collection::stream)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
|
||||
public int getRowCount() {
|
||||
|
||||
return getRows().size();
|
||||
@ -85,259 +76,16 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
|
||||
return getRows().stream()
|
||||
.mapToInt(List::size)
|
||||
.max()
|
||||
.orElse(0);
|
||||
.max().orElse(0);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Detect header cells (either first row or first column):
|
||||
* Column is marked as header if originalCell text is bold and row originalCell text is not bold.
|
||||
* Defaults to row.
|
||||
*/
|
||||
private void computeHeaders() {
|
||||
|
||||
if (rows == null) {
|
||||
rows = computeRows();
|
||||
}
|
||||
// A bold originalCell is a header originalCell as long as every originalCell to the left/top is bold, too
|
||||
// we move from left to right and top to bottom
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
List<Cell> rowCells = rows.get(rowIndex);
|
||||
if (rowCells.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
|
||||
Cell cell = rowCells.get(colIndex);
|
||||
List<Cell> cellsToTheLeft = rowCells.subList(0, colIndex);
|
||||
Cell lastHeaderCell = null;
|
||||
for (Cell leftCell : cellsToTheLeft) {
|
||||
if (leftCell.isHeaderCell()) {
|
||||
lastHeaderCell = leftCell;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
List<Cell> cellsToTheTop = new ArrayList<>();
|
||||
for (int i = 0; i < rowIndex; i++) {
|
||||
try {
|
||||
cellsToTheTop.add(rows.get(i)
|
||||
.get(colIndex));
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
|
||||
}
|
||||
}
|
||||
for (Cell topCell : cellsToTheTop) {
|
||||
if (topCell.isHeaderCell()) {
|
||||
lastHeaderCell = topCell;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
if (!cell.getTextBlocks().isEmpty() && cell.getTextBlocks()
|
||||
.get(0).getMostPopularWordStyle().equals("bold")) {
|
||||
cell.setHeaderCell(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private List<List<Cell>> computeRows() {
|
||||
|
||||
List<List<Cell>> rows = new ArrayList<>();
|
||||
if (rotation == 90) {
|
||||
for (int i = 0; i < unrotatedColCount; i++) { // rows
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = unrotatedRowCount - 1; j >= 0; j--) { // cols
|
||||
Cell cell = cellTreeMap.get(new CellPosition(j, i));
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
}
|
||||
}
|
||||
rows.add(lastRow);
|
||||
}
|
||||
} else if (rotation == 270) {
|
||||
for (int i = unrotatedColCount - 1; i >= 0; i--) { // rows
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = 0; j < unrotatedRowCount; j++) { // cols
|
||||
Cell cell = cellTreeMap.get(new CellPosition(j, i));
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
}
|
||||
}
|
||||
rows.add(lastRow);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < unrotatedRowCount; i++) {
|
||||
List<Cell> lastRow = new ArrayList<>();
|
||||
for (int j = 0; j < unrotatedColCount; j++) {
|
||||
Cell cell = cellTreeMap.get(new CellPosition(i, j)); // JAVA_8 use getOrDefault()
|
||||
if (cell != null) {
|
||||
lastRow.add(cell);
|
||||
}
|
||||
}
|
||||
rows.add(lastRow);
|
||||
}
|
||||
}
|
||||
|
||||
return rows;
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void addCells(List<Cell> cells) {
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
cells.removeIf(cell -> cell.getWidth() < 1.1 || cell.getHeight() < 1.1);
|
||||
|
||||
List<List<Cell>> rowsOfCellsMatrix = calculateTableStructure(cells);
|
||||
|
||||
for (int i = 0; i < rowsOfCellsMatrix.size(); i++) {
|
||||
for (int j = 0; j < rowsOfCellsMatrix.get(i).size(); j++) {
|
||||
addCellToRowAndCol(rowsOfCellsMatrix.get(i)
|
||||
.get(j), i, j);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Calculates the structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
|
||||
*
|
||||
* @param cells The found cells
|
||||
* @return TablePageBlock Structure as a rows of cells matrix
|
||||
*/
|
||||
private List<List<Cell>> calculateTableStructure(List<Cell> cells) {
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
Set<Double> uniqueX = new HashSet<>();
|
||||
Set<Double> uniqueY = new HashSet<>();
|
||||
cells.stream()
|
||||
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
|
||||
.forEach(c -> {
|
||||
uniqueX.add(c.getPdfMinX());
|
||||
uniqueX.add(c.getPdfMaxX());
|
||||
uniqueY.add(c.getPdfMinY());
|
||||
uniqueY.add(c.getPdfMaxY());
|
||||
});
|
||||
|
||||
var sortedUniqueX = uniqueX.stream()
|
||||
.sorted()
|
||||
.toList();
|
||||
var sortedUniqueY = uniqueY.stream()
|
||||
.sorted()
|
||||
.toList();
|
||||
|
||||
List<List<Cell>> rowsOfCells = new ArrayList<>();
|
||||
|
||||
Double prevY = null;
|
||||
|
||||
for (Double y : sortedUniqueY) {
|
||||
|
||||
List<Cell> row = new ArrayList<>();
|
||||
|
||||
Double prevX = null;
|
||||
for (Double x : sortedUniqueX) {
|
||||
|
||||
if (prevY != null && prevX != null) {
|
||||
var cellFromGridStructure = new Cell(new Point2D.Double(prevX, prevY), new Point2D.Double(x, y));
|
||||
|
||||
if (cellFromGridStructure.hasMinimumSize()) {
|
||||
|
||||
cells.stream()
|
||||
.map(originalCell -> new CellWithIntersection(originalCell,
|
||||
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxPdf(),
|
||||
originalCell.getBBoxPdf())))
|
||||
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
|
||||
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
|
||||
.max(Comparator.comparing(CellWithIntersection::intersectedArea))
|
||||
.map(CellWithIntersection::originalCell)
|
||||
.ifPresent(matchingCell -> cellFromGridStructure.getTextBlocks().addAll(matchingCell.getTextBlocks()));
|
||||
|
||||
row.add(cellFromGridStructure);
|
||||
}
|
||||
}
|
||||
prevX = x;
|
||||
}
|
||||
|
||||
// exclude empty rows and rows where all text blocks are empty
|
||||
if (prevY != null && prevX != null && !row.isEmpty() && !row.stream()
|
||||
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
|
||||
|
||||
rowsOfCells.add(row);
|
||||
}
|
||||
prevY = y;
|
||||
}
|
||||
|
||||
Collections.reverse(rowsOfCells);
|
||||
|
||||
// now cells are removed which are part of a column without any text blocks
|
||||
// this is done by first computing the inverse matrix which contains call columns of cells
|
||||
// then the column indices that have to be removed are determined
|
||||
List<List<Cell>> columnsOfCells = new ArrayList<>();
|
||||
int maxRowLength = rowsOfCells.stream()
|
||||
.map(List::size)
|
||||
.max(java.util.Comparator.naturalOrder())
|
||||
.orElse(0);
|
||||
for (int i = 0; i < maxRowLength; i++) {
|
||||
columnsOfCells.add(new ArrayList<>());
|
||||
}
|
||||
|
||||
for (List<Cell> row : rowsOfCells) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
columnsOfCells.get(j).add(row.get(j));
|
||||
}
|
||||
}
|
||||
|
||||
List<Integer> columnIndicesToRemove = new ArrayList<>();
|
||||
int columnIndex = 0;
|
||||
for (List<Cell> col : columnsOfCells) {
|
||||
if (col.stream()
|
||||
.allMatch(cell -> cell.getTextBlocks().isEmpty())) {
|
||||
columnIndicesToRemove.add(columnIndex);
|
||||
}
|
||||
columnIndex++;
|
||||
}
|
||||
columnIndicesToRemove.sort(Collections.reverseOrder());
|
||||
|
||||
// update all rows so that the values of the empty columns get removed
|
||||
var rowsOfCellsBefore = new ArrayList<>(rowsOfCells);
|
||||
rowsOfCells = new ArrayList<>();
|
||||
for (List<Cell> row : rowsOfCellsBefore) {
|
||||
var updatedRow = new ArrayList<>(row);
|
||||
columnIndicesToRemove.forEach(idxToRemove -> updatedRow.remove(updatedRow.get(idxToRemove)));
|
||||
rowsOfCells.add(updatedRow);
|
||||
}
|
||||
|
||||
return rowsOfCells;
|
||||
}
|
||||
|
||||
|
||||
private void addCellToRowAndCol(Cell cell, int row, int col) {
|
||||
|
||||
unrotatedRowCount = Math.max(unrotatedRowCount, row + 1);
|
||||
unrotatedColCount = Math.max(unrotatedColCount, col + 1);
|
||||
|
||||
CellPosition cp = new CellPosition(row, col);
|
||||
cellTreeMap.put(cp, cell);
|
||||
public List<Cell> getCells() {
|
||||
|
||||
return getRows().stream()
|
||||
.flatMap(List::stream)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
@ -360,7 +108,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
}
|
||||
if (column != null && column.getTextBlocks() != null) {
|
||||
boolean first = true;
|
||||
for (TextPageBlock textBlock : column.getTextBlocks()) {
|
||||
for (AbstractPageBlock textBlock : column.getTextBlocks()) {
|
||||
if (!first) {
|
||||
sb.append("\n");
|
||||
}
|
||||
@ -392,7 +140,7 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
sb.append(i == 0 ? "\n<th>" : "\n<td>");
|
||||
if (column != null && column.getTextBlocks() != null) {
|
||||
boolean first = true;
|
||||
for (TextPageBlock textBlock : column.getTextBlocks()) {
|
||||
for (AbstractPageBlock textBlock : column.getTextBlocks()) {
|
||||
if (!first) {
|
||||
sb.append("<br />");
|
||||
}
|
||||
@ -411,9 +159,4 @@ public class TablePageBlock extends AbstractPageBlock {
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
||||
record CellWithIntersection(Cell originalCell, double intersectedArea) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,9 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@ -1,9 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
|
||||
@ -7,8 +7,7 @@ import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -65,7 +64,7 @@ public class RedTextPosition extends TextBoundingBox {
|
||||
pos.setBBoxDirAdj(dirAdjPosition);
|
||||
|
||||
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
|
||||
Rectangle2D bBoxInitialUserSpace = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
|
||||
Rectangle2D bBoxInitialUserSpace = RectangleTransformations.transform(dirAdjPosition, affineTransform);
|
||||
|
||||
pos.setBBoxPdf(bBoxInitialUserSpace); // These are definitely correct
|
||||
|
||||
|
||||
@ -2,47 +2,62 @@ package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@EqualsAndHashCode(callSuper = true)
|
||||
@Data
|
||||
@AllArgsConstructor
|
||||
@Builder
|
||||
@NoArgsConstructor
|
||||
public class TextPageBlock extends AbstractPageBlock {
|
||||
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
private List<Word> words = new ArrayList<>();
|
||||
@Builder.Default
|
||||
@EqualsAndHashCode.Exclude
|
||||
private FrequencyCounters frequencyCounters = new FrequencyCounters();
|
||||
|
||||
private Rectangle2D bBoxDirAdj;
|
||||
|
||||
private boolean underlined;
|
||||
|
||||
private PageBlockType classification;
|
||||
|
||||
private boolean toDuplicate;
|
||||
|
||||
@EqualsAndHashCode.Exclude
|
||||
private String text;
|
||||
private boolean changed;
|
||||
|
||||
|
||||
public TextPageBlock(List<Word> words, int page, PageBlockType classification, Set<LayoutEngine> engines, Orientation orientation) {
|
||||
|
||||
this.page = page;
|
||||
this.classification = classification;
|
||||
this.engines = engines;
|
||||
this.orientation = orientation;
|
||||
setDefaultFields(words);
|
||||
}
|
||||
|
||||
|
||||
public TextPageBlock(List<Word> words) {
|
||||
|
||||
setDefaultFields(words);
|
||||
}
|
||||
|
||||
|
||||
private void setDefaultFields(List<Word> words) {
|
||||
|
||||
this.words = new ArrayList<>(words);
|
||||
this.frequencyCounters = new FrequencyCounters();
|
||||
|
||||
@ -73,10 +88,6 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
this.bBoxDirAdj = new Rectangle2D.Double();
|
||||
return;
|
||||
}
|
||||
this.bBoxDirAdj = words.stream()
|
||||
.map(Word::getBBoxDirAdj)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
setToBBoxOfComponents(words);
|
||||
}
|
||||
|
||||
@ -87,7 +98,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||
public static TextPageBlock merge(Collection<TextPageBlock> textBlocksToMerge) {
|
||||
|
||||
if (textBlocksToMerge.isEmpty()) {
|
||||
throw new IllegalArgumentException("Need to provide at least one TextPageBlock.");
|
||||
@ -98,14 +109,33 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
.count() != 1) {
|
||||
throw new IllegalArgumentException("Cannot merge textBlocks on different pages.");
|
||||
}
|
||||
if (textBlocksToMerge.stream()
|
||||
.map(AbstractPageBlock::getClassification)
|
||||
.distinct()
|
||||
.count() != 1) {
|
||||
throw new IllegalArgumentException("Cannot merge textBlocks of different types.");
|
||||
}
|
||||
if (textBlocksToMerge.stream()
|
||||
.map(AbstractPageBlock::getDir)
|
||||
.distinct()
|
||||
.count() != 1) {
|
||||
throw new IllegalArgumentException("Cannot merge textBlocks of different directions.");
|
||||
}
|
||||
|
||||
List<Word> sequences = textBlocksToMerge.stream()
|
||||
.map(TextPageBlock::getWords)
|
||||
.flatMap(java.util.Collection::stream)
|
||||
.toList();
|
||||
sequences = new ArrayList<>(sequences);
|
||||
.flatMap(Collection::stream)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
return new TextPageBlock(sequences);
|
||||
TextPageBlock first = textBlocksToMerge.iterator().next();
|
||||
return new TextPageBlock(sequences,
|
||||
first.getPage(),
|
||||
first.getClassification(),
|
||||
textBlocksToMerge.stream()
|
||||
.map(AbstractPageBlock::getEngines)
|
||||
.flatMap(Collection::stream)
|
||||
.collect(Collectors.toSet()),
|
||||
Orientation.NONE);
|
||||
}
|
||||
|
||||
|
||||
@ -172,6 +202,14 @@ public class TextPageBlock extends AbstractPageBlock {
|
||||
}
|
||||
|
||||
|
||||
public void removeAll(List<Word> words) {
|
||||
|
||||
changed = true;
|
||||
this.words.removeAll(words);
|
||||
setDefaultFields(this.words);
|
||||
}
|
||||
|
||||
|
||||
public TextPageBlock copy() {
|
||||
|
||||
return new TextPageBlock(new ArrayList<>(words));
|
||||
|
||||
@ -15,6 +15,7 @@ import org.apache.pdfbox.text.TextPosition;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Builder;
|
||||
@ -66,9 +67,9 @@ public class Word extends TextBoundingBox implements CharSequence {
|
||||
}
|
||||
|
||||
|
||||
public Word(List<Character> textPositions, int page) {
|
||||
public Word(List<Character> characters, int page) {
|
||||
|
||||
this.characters = new ArrayList<>(textPositions);
|
||||
this.characters = new ArrayList<>(characters);
|
||||
this.page = page;
|
||||
calculateBBoxAndHashcode();
|
||||
}
|
||||
@ -101,12 +102,12 @@ public class Word extends TextBoundingBox implements CharSequence {
|
||||
@Override
|
||||
public Word subSequence(int start, int end) {
|
||||
|
||||
var textPositionSequence = new Word();
|
||||
textPositionSequence.characters = characters.subList(start, end);
|
||||
textPositionSequence.page = page;
|
||||
textPositionSequence.dir = dir;
|
||||
textPositionSequence.setToBBoxOfComponents(getTextPositions());
|
||||
return textPositionSequence;
|
||||
var word = new Word();
|
||||
word.characters = characters.subList(start, end);
|
||||
word.page = page;
|
||||
word.dir = dir;
|
||||
word.setToBBoxOfComponents(getTextPositions());
|
||||
return word;
|
||||
}
|
||||
|
||||
|
||||
@ -262,7 +263,7 @@ public class Word extends TextBoundingBox implements CharSequence {
|
||||
public void transform(AffineTransform rotateInstance) {
|
||||
|
||||
for (RedTextPosition textPosition : getTextPositions()) {
|
||||
Rectangle2D exactDirAdjCoordinates = rotateInstance.createTransformedShape(textPosition.getBBoxDirAdj()).getBounds2D();
|
||||
Rectangle2D exactDirAdjCoordinates = RectangleTransformations.transform(textPosition.getBBoxDirAdj(), rotateInstance);
|
||||
textPosition.setBBoxDirAdj(exactDirAdjCoordinates);
|
||||
}
|
||||
calculateBBoxAndHashcode();
|
||||
|
||||
@ -13,7 +13,10 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageMetadata;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.image.ImageServiceResponse;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.Figure;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
@ -21,48 +24,78 @@ import lombok.RequiredArgsConstructor;
|
||||
@RequiredArgsConstructor
|
||||
public class ImageServiceResponseAdapter {
|
||||
|
||||
public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse) {
|
||||
public Map<Integer, List<ClassifiedImage>> buildClassifiedImagesPerPage(ImageServiceResponse imageServiceResponse, IdpResult idpResult) {
|
||||
|
||||
Map<Integer, List<ClassifiedImage>> images = new HashMap<>();
|
||||
imageServiceResponse.getData().forEach(imageMetadata -> {
|
||||
var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification()
|
||||
.getLabel()
|
||||
.toUpperCase(Locale.ROOT)) : ImageType.OTHER;
|
||||
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
||||
imageMetadata.getPosition().getY1(),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber(),imageMetadata.getRepresentation()));
|
||||
});
|
||||
|
||||
// Currently This is a copy but, it will be changed later because i don' t think that we should unclassified images.
|
||||
imageServiceResponse.getDataCV().forEach(imageMetadata -> {
|
||||
var classification = imageMetadata.getFilters().isAllPassed() ? ImageType.valueOf(imageMetadata.getClassification()
|
||||
.getLabel()
|
||||
.toUpperCase(Locale.ROOT)) : ImageType.OTHER;
|
||||
images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
||||
imageMetadata.getPosition().getY1(),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight()), classification, imageMetadata.isAlpha(), imageMetadata.getPosition().getPageNumber(),imageMetadata.getRepresentation()));
|
||||
});
|
||||
imageServiceResponse.getData()
|
||||
.forEach(imageMetadata -> addImageMetaData(imageMetadata, images));
|
||||
imageServiceResponse.getDataCV()
|
||||
.forEach(imageMetadata -> addImageMetaData(imageMetadata, images));
|
||||
idpResult.figures()
|
||||
.forEach(figure -> addFigure(figure, images));
|
||||
|
||||
return images;
|
||||
}
|
||||
|
||||
|
||||
private static void addFigure(Figure figure, Map<Integer, List<ClassifiedImage>> images) {
|
||||
|
||||
var classification = ImageType.GRAPHIC;
|
||||
ClassifiedImage image = new ClassifiedImage(figure.image().bbox().get().getBounds2D(), classification, false, figure.image().pageNumber(), "");
|
||||
getImagesOnPage(figure.image().pageNumber(), images).add(image);
|
||||
}
|
||||
|
||||
|
||||
private static void addImageMetaData(ImageMetadata imageMetadata, Map<Integer, List<ClassifiedImage>> images) {
|
||||
|
||||
var image = new ClassifiedImage(getPosition(imageMetadata),
|
||||
getImageType(imageMetadata),
|
||||
imageMetadata.isAlpha(),
|
||||
imageMetadata.getPosition().getPageNumber(),
|
||||
imageMetadata.getRepresentation());
|
||||
getImagesOnPage(imageMetadata.getPosition().getPageNumber(), images).add(image);
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle2D.Double getPosition(ImageMetadata imageMetadata) {
|
||||
|
||||
return new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
||||
imageMetadata.getPosition().getY1(),
|
||||
imageMetadata.getGeometry().getWidth(),
|
||||
imageMetadata.getGeometry().getHeight());
|
||||
}
|
||||
|
||||
|
||||
private static ImageType getImageType(ImageMetadata imageMetadata) {
|
||||
|
||||
if (imageMetadata.getFilters().isAllPassed()) {
|
||||
return ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT));
|
||||
} else {
|
||||
return ImageType.OTHER;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static List<ClassifiedImage> getImagesOnPage(int pageNumber, Map<Integer, List<ClassifiedImage>> images) {
|
||||
|
||||
return images.computeIfAbsent(pageNumber, x -> new ArrayList<>());
|
||||
}
|
||||
|
||||
|
||||
public void findOcr(ClassificationPage classificationPage) {
|
||||
|
||||
classificationPage.getImages().forEach(image -> {
|
||||
if (image.getImageType().equals(ImageType.OTHER)) {
|
||||
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
|
||||
if (image.getPosition().contains(textblock.getBBoxPdf())) {
|
||||
image.setImageType(ImageType.OCR);
|
||||
return;
|
||||
classificationPage.getImages()
|
||||
.forEach(image -> {
|
||||
if (image.getImageType().equals(ImageType.OTHER)) {
|
||||
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
|
||||
if (image.getPosition().contains(textblock.getBBoxPdf())) {
|
||||
image.setImageType(ImageType.OCR);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -183,7 +183,7 @@ public class BodyTextFrameService {
|
||||
if (cell == null || cell.getTextBlocks() == null) {
|
||||
continue;
|
||||
}
|
||||
for (TextPageBlock textBlock : cell.getTextBlocks()) {
|
||||
for (AbstractPageBlock textBlock : cell.getTextBlocks()) {
|
||||
expandRectangle(textBlock, page, expansionsRectangle);
|
||||
}
|
||||
}
|
||||
@ -198,7 +198,7 @@ public class BodyTextFrameService {
|
||||
}
|
||||
|
||||
|
||||
private void expandRectangle(TextPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) {
|
||||
private void expandRectangle(AbstractPageBlock textBlock, ClassificationPage page, BodyTextFrameExpansionsRectangle expansionsRectangle) {
|
||||
|
||||
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
|
||||
if (textBlock.getPdfMinY() < expansionsRectangle.minX) {
|
||||
|
||||
@ -1,25 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class MainBodyTextFrameExtractionService {
|
||||
|
||||
private static final double TEXT_FRAME_PAD_WIDTH = 0.0;
|
||||
private static final double TEXT_FRAME_PAD_HEIGHT = 0.02;
|
||||
|
||||
|
||||
public Rectangle2D calculateMainBodyTextFrame(LineInformation lineInformation) {
|
||||
|
||||
Rectangle2D mainBodyTextFrame = lineInformation.getLineBBox().stream()
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
return RectangleTransformations.pad(mainBodyTextFrame, mainBodyTextFrame.getWidth() * TEXT_FRAME_PAD_WIDTH, mainBodyTextFrame.getHeight() * TEXT_FRAME_PAD_HEIGHT);
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,73 +1,207 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.FindGraphicsRaster;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicBBDetector;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.AccessLevel;
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@UtilityClass
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class PageContentExtractor {
|
||||
|
||||
public List<PageContents> getSortedPageContents(String filename) throws IOException {
|
||||
static boolean USE_IMAGE_BASED_GRAPHIC_DETECTION;
|
||||
@Getter
|
||||
int pageCount;
|
||||
@Getter
|
||||
File document;
|
||||
|
||||
List<PageContents> textPositionSequencesPerPage = new LinkedList<>();
|
||||
ClassPathResource pdfResource = new ClassPathResource(filename);
|
||||
PageContents[] pageContents;
|
||||
CountDownLatch[] finishedLookup;
|
||||
List<List<Integer>> pageNumberBatches;
|
||||
|
||||
try (PDDocument pdDocument = Loader.loadPDF(pdfResource.getFile())) {
|
||||
|
||||
for (int pageNumber = 1; pageNumber < pdDocument.getNumberOfPages() + 1; pageNumber++) {
|
||||
public PageContentExtractor(File document, int threads) {
|
||||
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
PDPage pdPage = pdDocument.getPage(pageNumber - 1);
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setSortByPosition(true);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(pdDocument);
|
||||
|
||||
Map<Float, List<Word>> sortedTextPositionSequencesPerDir = stripper.getWords()
|
||||
.stream()
|
||||
.collect(Collectors.groupingBy(textPositionSequence -> textPositionSequence.getDir().getDegrees()));
|
||||
|
||||
var sortedTextPositionSequences = sortByDirAccordingToPageRotation(sortedTextPositionSequencesPerDir, pdPage.getRotation());
|
||||
|
||||
textPositionSequencesPerPage.add(new PageContents(sortedTextPositionSequences,
|
||||
RectangleTransformations.toRectangle2D(pdPage.getCropBox()),
|
||||
RectangleTransformations.toRectangle2D(pdPage.getMediaBox()),
|
||||
stripper.getRulings()));
|
||||
}
|
||||
this.document = document;
|
||||
this.pageCount = getPageCount(document);
|
||||
this.pageContents = new PageContents[pageCount];
|
||||
this.finishedLookup = new CountDownLatch[pageCount];
|
||||
for (int i = 0; i < pageCount; i++) {
|
||||
this.finishedLookup[i] = new CountDownLatch(1);
|
||||
}
|
||||
int actualThreads = Math.min(pageCount, threads);
|
||||
pageNumberBatches = new ArrayList<>(actualThreads);
|
||||
for (int i = 0; i < actualThreads; i++) {
|
||||
pageNumberBatches.add(new ArrayList<>(pageCount / actualThreads));
|
||||
}
|
||||
for (int i = 1; i <= pageCount; i++) {
|
||||
pageNumberBatches.get(i % actualThreads).add(i);
|
||||
}
|
||||
|
||||
return textPositionSequencesPerPage;
|
||||
}
|
||||
|
||||
|
||||
public List<Word> sortByDirAccordingToPageRotation(Map<Float, List<Word>> sortedTextPositionSequencesPerDir, int rotation) {
|
||||
@SneakyThrows
|
||||
private int getPageCount(File document) {
|
||||
|
||||
LinkedList<Float> sortedKeys = new LinkedList<>(sortedTextPositionSequencesPerDir.keySet().stream().sorted().toList());
|
||||
|
||||
for (int i = 0; i < sortedKeys.size(); i++) {
|
||||
if (sortedKeys.get(i) < rotation) {
|
||||
Float keyToSwap = sortedKeys.remove(i);
|
||||
sortedKeys.addLast(keyToSwap);
|
||||
}
|
||||
try (var doc = openDocument(document)) {
|
||||
return doc.getNumberOfPages();
|
||||
}
|
||||
return sortedKeys.stream().map(sortedTextPositionSequencesPerDir::get).flatMap(Collection::stream).toList();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void startAsync() {
|
||||
|
||||
for (List<Integer> pageNumberBatch : pageNumberBatches) {
|
||||
Thread thread = new Thread(() -> extractPages(pageNumberBatch));
|
||||
thread.start();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void extractPages(List<Integer> pageNumbers) {
|
||||
|
||||
var doc = openDocument(document);
|
||||
int count = 0;
|
||||
var pageGetter = new PageGetter(doc.getPages()
|
||||
.iterator(), pageCount);
|
||||
for (Integer pageNumber : pageNumbers) {
|
||||
count++;
|
||||
if (count % 100 == 0) {
|
||||
// As PDFBox caches all types of stuff, we need to close the document every once in a while to save on RAM
|
||||
doc.close();
|
||||
doc = openDocument(document);
|
||||
}
|
||||
|
||||
extractPage(pageNumber, doc, pageGetter.getPage(pageNumber));
|
||||
}
|
||||
doc.close();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private PDDocument openDocument(File originFile) {
|
||||
|
||||
PDDocument document = Loader.loadPDF(originFile);
|
||||
document.setAllSecurityToBeRemoved(true);
|
||||
return document;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public void extractPage(Integer pageNumber, PDDocument doc, PDPage pdPage) {
|
||||
|
||||
PDFLinesTextStripper stripper = new PDFLinesTextStripper();
|
||||
stripper.setPageNumber(pageNumber);
|
||||
stripper.setStartPage(pageNumber);
|
||||
stripper.setEndPage(pageNumber);
|
||||
stripper.setPdpage(pdPage);
|
||||
stripper.getText(doc);
|
||||
|
||||
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
|
||||
List<Word> words = stripper.getWords();
|
||||
List<Ruling> rulings = stripper.getRulings();
|
||||
List<Box> graphicBBoxes = findGraphicBBoxes(pageInformation, pdPage, doc, words);
|
||||
|
||||
pageContents[pageNumber - 1] = new PageContents(pageInformation, words, rulings, graphicBBoxes);
|
||||
finishedLookup[pageNumber - 1].countDown();
|
||||
}
|
||||
|
||||
|
||||
private static List<Box> findGraphicBBoxes(PageInformation pageInformation, PDPage pdPage, PDDocument doc, List<Word> words) throws IOException {
|
||||
|
||||
GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true);
|
||||
List<Box> graphicBBoxes = graphicBBDetector.findGraphicBB();
|
||||
|
||||
if (USE_IMAGE_BASED_GRAPHIC_DETECTION) {
|
||||
// This should only be used if ocr was performed, it is currently in an early stage and needs to be improved.
|
||||
List<Rectangle2D> wordIgnoreZones = words.stream()
|
||||
.map(BoundingBox::getBBoxPdf)
|
||||
.map(box -> RectangleTransformations.pad(box, 2, 2))
|
||||
.collect(Collectors.toList());
|
||||
graphicBBoxes.addAll(FindGraphicsRaster.findCCBoundingBoxes(doc, wordIgnoreZones, pageInformation));
|
||||
}
|
||||
return graphicBBoxes;
|
||||
}
|
||||
|
||||
|
||||
public PageContents awaitPageContents(Integer pageNumber) throws InterruptedException {
|
||||
|
||||
finishedLookup[pageNumber - 1].await();
|
||||
return pageContents[pageNumber - 1];
|
||||
}
|
||||
|
||||
|
||||
public List<PageContents> awaitAllContents() throws InterruptedException {
|
||||
|
||||
for (CountDownLatch countDownLatch : finishedLookup) {
|
||||
countDownLatch.await();
|
||||
}
|
||||
return Arrays.asList(pageContents);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static List<PageContents> getDocumentContents(File document, int threads) {
|
||||
|
||||
PageContentExtractor extractor = new PageContentExtractor(document, threads);
|
||||
extractor.startAsync();
|
||||
return extractor.awaitAllContents();
|
||||
}
|
||||
|
||||
|
||||
private static class PageGetter {
|
||||
|
||||
Iterator<PDPage> pageIterator;
|
||||
int current;
|
||||
int max;
|
||||
|
||||
|
||||
PageGetter(Iterator<PDPage> pageIterator, int max) {
|
||||
|
||||
this.pageIterator = pageIterator;
|
||||
this.max = max;
|
||||
this.current = 0;
|
||||
}
|
||||
|
||||
|
||||
public PDPage getPage(int pageNumber) {
|
||||
|
||||
assert pageNumber >= current && pageNumber <= max;
|
||||
int pagesToIterate = pageNumber - current;
|
||||
PDPage page = null;
|
||||
for (int i = 0; i < pagesToIterate; i++) {
|
||||
page = pageIterator.next();
|
||||
}
|
||||
current = pageNumber;
|
||||
return page;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,24 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.LineInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class PageInformationService {
|
||||
|
||||
public PageInformation build(PageContents pageContents) {
|
||||
|
||||
LineInformation lineInformation = LineDetectionService.calculateLineInformation(pageContents.getSortedWords());
|
||||
Rectangle2D mainBodyTextFrame = MainBodyTextFrameExtractionService.calculateMainBodyTextFrame(lineInformation);
|
||||
GapInformation gapInformation = GapDetectionService.findGapsInLines(pageContents.getSortedWords(), mainBodyTextFrame);
|
||||
|
||||
return new PageInformation(pageContents, lineInformation, mainBodyTextFrame, gapInformation);
|
||||
}
|
||||
|
||||
}
|
||||
@ -5,18 +5,20 @@ import static com.knecon.fforesight.service.layoutparser.processor.utils.Geometr
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.UnionFind;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -52,22 +54,22 @@ public class RulingCleaningService {
|
||||
|
||||
private Rulings cleanRulings(Rulings rulings) {
|
||||
|
||||
List<List<Rectangle2D>> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
|
||||
.map(RulingCleaningService::getOverlapRectangle)
|
||||
.distinct()
|
||||
.toList());
|
||||
var groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
|
||||
.map(RulingCleaningService::getOverlapRectangle)
|
||||
.distinct()
|
||||
.toList());
|
||||
List<Ruling> cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream()
|
||||
.map(rectList -> getXCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList)))
|
||||
.map(RulingCleaningService::getXCenteredRuling)
|
||||
.filter(ruling -> ruling.length() > 0)
|
||||
.toList();
|
||||
|
||||
List<List<Rectangle2D>> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
|
||||
.map(RulingCleaningService::getOverlapRectangle)
|
||||
.distinct()
|
||||
.toList());
|
||||
var groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
|
||||
.map(RulingCleaningService::getOverlapRectangle)
|
||||
.distinct()
|
||||
.toList());
|
||||
|
||||
List<Ruling> cleanedHorizontalRulings = groupedOverlappingHorizontalRectangles.stream()
|
||||
.map(rectList -> getYCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList)))
|
||||
.map(RulingCleaningService::getYCenteredRuling)
|
||||
.filter(ruling -> ruling.length() > 0)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
@ -75,13 +77,40 @@ public class RulingCleaningService {
|
||||
}
|
||||
|
||||
|
||||
private List<List<Rectangle2D>> groupOverlappingRectangles(List<Rectangle2D> rectangles) {
|
||||
private static Ruling getXCenteredRuling(Set<OverlapRectangle> rectList) {
|
||||
|
||||
UnionFind<Rectangle2D> unionFind = new UnionFind<>();
|
||||
Ruling ruling = getXCenteredRuling(rectList.stream()
|
||||
.map(OverlapRectangle::rectangle2D)
|
||||
.collect(RectangleTransformations.collectBBox()));
|
||||
ruling.setStyle(rectList.iterator().next().style);
|
||||
return ruling;
|
||||
}
|
||||
|
||||
|
||||
private static Ruling getYCenteredRuling(Set<OverlapRectangle> rectList) {
|
||||
|
||||
Ruling ruling = getYCenteredRuling(rectList.stream()
|
||||
.map(OverlapRectangle::rectangle2D)
|
||||
.collect(RectangleTransformations.collectBBox()));
|
||||
ruling.setStyle(rectList.iterator().next().style);
|
||||
return ruling;
|
||||
}
|
||||
|
||||
|
||||
private Collection<Set<OverlapRectangle>> groupOverlappingRectangles(List<OverlapRectangle> rectangles) {
|
||||
|
||||
UnionFind<OverlapRectangle> unionFind = new UnionFind<>(new HashSet<>(rectangles));
|
||||
for (int i = 0; i < rectangles.size(); i++) {
|
||||
for (int j = i + 1; j < rectangles.size(); j++) {
|
||||
Rectangle2D rectangle1 = rectangles.get(i);
|
||||
Rectangle2D rectangle2 = rectangles.get(j);
|
||||
|
||||
OverlapRectangle overlapRectangle1 = rectangles.get(i);
|
||||
OverlapRectangle overlapRectangle2 = rectangles.get(j);
|
||||
|
||||
if (!Objects.equals(overlapRectangle1.style, overlapRectangle2.style)) {
|
||||
continue;
|
||||
}
|
||||
Rectangle2D rectangle1 = overlapRectangle1.rectangle2D;
|
||||
Rectangle2D rectangle2 = overlapRectangle2.rectangle2D;
|
||||
|
||||
// we can stop early when we are too far off because of x-y-sorting
|
||||
if (rectangle1.getMaxX() < rectangle2.getMinX() && rectangle1.getMaxY() < rectangle2.getMinY()) {
|
||||
@ -89,21 +118,16 @@ public class RulingCleaningService {
|
||||
}
|
||||
|
||||
if (rectangle1.intersects(rectangle2)) {
|
||||
unionFind.union(rectangle1, rectangle2);
|
||||
unionFind.union(overlapRectangle1, overlapRectangle2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Map<Rectangle2D, List<Rectangle2D>> groups = new HashMap<>();
|
||||
for (Rectangle2D rectangle : rectangles) {
|
||||
Rectangle2D root = unionFind.find(rectangle);
|
||||
groups.computeIfAbsent(root, k -> new ArrayList<>()).add(rectangle);
|
||||
}
|
||||
return new ArrayList<>(groups.values());
|
||||
return unionFind.getGroups();
|
||||
}
|
||||
|
||||
|
||||
private static Rectangle2D getOverlapRectangle(Ruling ruling) {
|
||||
private static OverlapRectangle getOverlapRectangle(Ruling ruling) {
|
||||
|
||||
float y;
|
||||
float x;
|
||||
@ -124,12 +148,14 @@ public class RulingCleaningService {
|
||||
y = ruling.y2;
|
||||
h = ruling.y1 - ruling.y2;
|
||||
}
|
||||
|
||||
Rectangle2D overlapRectangle;
|
||||
if (ruling.isHorizontal()) {
|
||||
return new Rectangle2D.Double(x - THRESHOLD_X_HORIZONTAL, y - THRESHOLD_Y_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
|
||||
overlapRectangle = new Rectangle2D.Double(x - THRESHOLD_X_HORIZONTAL, y - THRESHOLD_Y_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
|
||||
} else {
|
||||
return new Rectangle2D.Double(x - THRESHOLD_X_VERTICAL, y - THRESHOLD_Y_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
|
||||
overlapRectangle = new Rectangle2D.Double(x - THRESHOLD_X_VERTICAL, y - THRESHOLD_Y_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
|
||||
}
|
||||
|
||||
return new OverlapRectangle(overlapRectangle, ruling.getStyle());
|
||||
}
|
||||
|
||||
|
||||
@ -243,4 +269,8 @@ public class RulingCleaningService {
|
||||
|
||||
}
|
||||
|
||||
private record OverlapRectangle(Rectangle2D rectangle2D, Ruling.Style style) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -12,8 +12,8 @@ import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
@ -30,7 +30,6 @@ import lombok.extern.slf4j.Slf4j;
|
||||
@Deprecated
|
||||
public class SectionsBuilderService {
|
||||
|
||||
|
||||
public void buildSections(ClassificationDocument document) {
|
||||
|
||||
List<AbstractPageBlock> chunkWords = new ArrayList<>();
|
||||
@ -73,8 +72,7 @@ public class SectionsBuilderService {
|
||||
chunkBlockList.add(chunkBlock);
|
||||
chunkWords = new ArrayList<>();
|
||||
if (!chunkBlock.getTables().isEmpty()) {
|
||||
previousTable = chunkBlock.getTables()
|
||||
.get(chunkBlock.getTables().size() - 1);
|
||||
previousTable = chunkBlock.getTables().get(chunkBlock.getTables().size() - 1);
|
||||
}
|
||||
}
|
||||
if (current instanceof TablePageBlock table) {
|
||||
@ -236,12 +234,8 @@ public class SectionsBuilderService {
|
||||
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
|
||||
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
|
||||
// Allow merging of tables if header row is separated from first logical non-header row
|
||||
if (previousTableNonHeaderRow.isEmpty()
|
||||
&& previousTable.getRowCount() == 1
|
||||
&& previousTable.getRows()
|
||||
.get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows()
|
||||
.get(0)
|
||||
if (previousTableNonHeaderRow.isEmpty() && previousTable.getRowCount() == 1 && previousTable.getRows().get(0).size() == tableNonHeaderRow.size()) {
|
||||
previousTableNonHeaderRow = previousTable.getRows().get(0)
|
||||
.stream()
|
||||
.map(cell -> {
|
||||
Cell fakeCell = Cell.copy(cell);
|
||||
@ -252,8 +246,7 @@ public class SectionsBuilderService {
|
||||
}
|
||||
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
|
||||
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = currentTable.getRows()
|
||||
.get(i);
|
||||
List<Cell> row = currentTable.getRows().get(i);
|
||||
if (row.size() == tableNonHeaderRow.size() && row.stream()
|
||||
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||
for (int j = 0; j < row.size(); j++) {
|
||||
@ -272,13 +265,6 @@ public class SectionsBuilderService {
|
||||
|
||||
for (AbstractPageBlock container : wordBlockList) {
|
||||
if (container instanceof TablePageBlock table) {
|
||||
|
||||
if (lastHeadline == null || lastHeadline.isEmpty()) {
|
||||
table.setHeadline("Text in table");
|
||||
} else {
|
||||
table.setHeadline("TablePageBlock in: " + lastHeadline);
|
||||
}
|
||||
|
||||
section.getPageBlocks().add(table);
|
||||
continue;
|
||||
}
|
||||
@ -310,8 +296,7 @@ public class SectionsBuilderService {
|
||||
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
|
||||
|
||||
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||
List<Cell> row = table.getRows()
|
||||
.get(i);
|
||||
List<Cell> row = table.getRows().get(i);
|
||||
if (row.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -1,159 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@Service
|
||||
public class TableExtractionService {
|
||||
|
||||
private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
|
||||
private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7;
|
||||
|
||||
|
||||
/**
|
||||
* Finds tables on a page and moves textblocks into cells of the found tables.
|
||||
* Note: This algorithm uses Pdf Coordinate System where {0,0} rotated with the page rotation.
|
||||
* 0 -> LowerLeft
|
||||
* 90 -> UpperLeft
|
||||
* 180 -> UpperRight
|
||||
* 270 -> LowerRight
|
||||
* <p>
|
||||
* DirAdj (Text direction adjusted) values can not be used here.
|
||||
*
|
||||
* @param emptyCells The cells used to build the table.
|
||||
* @param page Page object that contains textblocks and statistics.
|
||||
*/
|
||||
|
||||
public void extractTables(List<Cell> emptyCells, ClassificationPage page) {
|
||||
|
||||
// sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them
|
||||
emptyCells.sort(CELL_SIZE_COMPARATOR);
|
||||
|
||||
for (AbstractPageBlock abstractPageBlock : page.getTextBlocks()) {
|
||||
TextPageBlock textBlock = (TextPageBlock) abstractPageBlock;
|
||||
for (Cell cell : emptyCells) {
|
||||
if (cell.hasMinimumSize() && doesCellContainTextBlock(cell, textBlock)) {
|
||||
cell.addTextBlock(textBlock);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
List<Cell> cells = new ArrayList<>(new HashSet<>(emptyCells));
|
||||
DoubleComparisons.sort(cells, BoundingBox.ILL_DEFINED_ORDER);
|
||||
|
||||
List<Rectangle2D> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
|
||||
// sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
|
||||
// this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells
|
||||
spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);
|
||||
|
||||
List<TablePageBlock> tables = new ArrayList<>();
|
||||
for (Rectangle2D area : spreadsheetAreas) {
|
||||
|
||||
List<Cell> containedCells = new ArrayList<>();
|
||||
for (Cell c : cells) {
|
||||
if (c.hasMinimumSize() && area.contains(c.getBBoxPdf())) {
|
||||
containedCells.add(c);
|
||||
}
|
||||
}
|
||||
|
||||
var containedCellsWithText = containedCells.stream()
|
||||
.filter(cell -> !cell.getTextBlocks().isEmpty())
|
||||
.toList();
|
||||
|
||||
// verify if table would contain fewer cells with text than the threshold allows
|
||||
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
|
||||
tables.add(new TablePageBlock(containedCells, page.getRotation()));
|
||||
cells.removeAll(containedCells);
|
||||
}
|
||||
}
|
||||
|
||||
for (TablePageBlock table : tables) {
|
||||
int position = -1;
|
||||
|
||||
for (AbstractPageBlock pageBlock : page.getTextBlocks()) {
|
||||
if (pageBlock instanceof TextPageBlock ? table.contains(pageBlock) : table.contains(pageBlock) && position == -1) {
|
||||
position = page.getTextBlocks().indexOf(pageBlock);
|
||||
}
|
||||
}
|
||||
if (position != -1) {
|
||||
page.getTextBlocks().add(position, table);
|
||||
|
||||
var toBeRemoved = table.getCells()
|
||||
.stream()
|
||||
.map(Cell::getTextBlocks)
|
||||
.flatMap(List::stream)
|
||||
.toList();
|
||||
// remove text blocks from the page that were also added with the table (from its contained cells)
|
||||
page.getTextBlocks().removeAll(toBeRemoved);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean checkIfTableCellsAreUniform(List<Cell> containedCells) {
|
||||
|
||||
if (containedCells.size() <= 2) {
|
||||
return true;
|
||||
}
|
||||
|
||||
Map<Long, List<Long>> cellsGroupedByRoundedWidth = containedCells.stream()
|
||||
.map(BoundingBox::getWidth)
|
||||
.map(size -> Math.round(size / 10.0) * 10)
|
||||
.collect(Collectors.groupingBy(Long::longValue));
|
||||
|
||||
return (double) cellsGroupedByRoundedWidth.size() / containedCells.size() <= TABLE_UNIFORMITY_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
|
||||
|
||||
return cell.contains(textBlock, RedTextPosition.HEIGHT_PADDING);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, PageInformation pageInformation) {
|
||||
|
||||
AffineTransform affineTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToImageCoords(pageInformation, 1);
|
||||
/*
|
||||
switch (pageInformation.rotationDegrees()) {
|
||||
case 90 -> affineTransform.translate(RedTextPosition.HEIGHT_PADDING, 0); //although this is wrong, our text coordinates are wrong as well
|
||||
case 180 -> affineTransform.translate(0, RedTextPosition.HEIGHT_PADDING);
|
||||
case 270 -> affineTransform.translate(-RedTextPosition.HEIGHT_PADDING, 0);
|
||||
default -> affineTransform.translate(0, -RedTextPosition.HEIGHT_PADDING);
|
||||
}
|
||||
*/
|
||||
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
|
||||
.stream()
|
||||
.map(rect -> new Cell(rect, affineTransform))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
@ -18,7 +18,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentif
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||
|
||||
import lombok.Data;
|
||||
|
||||
@ -0,0 +1,42 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(makeFinal = true, level = AccessLevel.PRIVATE)
|
||||
public class BlockificationService {
|
||||
|
||||
RedactManagerBlockificationService redactManagerBlockificationService;
|
||||
DocstrumBlockificationService docstrumBlockificationService;
|
||||
DocuMineBlockificationService docuMineBlockificationService;
|
||||
|
||||
|
||||
public List<TextPageBlock> blockify(LayoutParsingType layoutParsingType, List<Word> words, CleanRulings cleanRulings, LayoutDebugLayer layoutDebugLayer) {
|
||||
|
||||
if (words.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
return switch (layoutParsingType) {
|
||||
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(words, cleanRulings, layoutDebugLayer);
|
||||
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
||||
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH ->
|
||||
docstrumBlockificationService.blockify(words, cleanRulings, true, layoutDebugLayer, layoutParsingType);
|
||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(words, cleanRulings, false, layoutDebugLayer, layoutParsingType);
|
||||
};
|
||||
}
|
||||
|
||||
}
|
||||
@ -10,7 +10,6 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
@ -30,46 +29,39 @@ public class DocstrumBlockificationService {
|
||||
static final float THRESHOLD = 1f;
|
||||
|
||||
|
||||
public ClassificationPage blockify(List<Word> textPositions,
|
||||
CleanRulings rulings,
|
||||
boolean xyOrder,
|
||||
LayoutDebugLayer visualizations,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
public List<TextPageBlock> blockify(List<Word> words, CleanRulings rulings, boolean xyOrder, LayoutDebugLayer visualizations, LayoutParsingType layoutParsingType) {
|
||||
|
||||
CleanRulings usedRulings = rulings.withoutTextRulings();
|
||||
CleanRulings rulingsWithoutTextRulings = rulings.withoutTextRulings();
|
||||
|
||||
List<Zone> zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);
|
||||
List<Zone> zones = docstrumSegmentationService.segmentPage(words, xyOrder, rulingsWithoutTextRulings);
|
||||
|
||||
if (!textPositions.isEmpty()) {
|
||||
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
|
||||
visualizations.addLineVisualizationsFromZones(zones, textPositions.get(0).getPage());
|
||||
visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage());
|
||||
if (!words.isEmpty() && visualizations != null) {
|
||||
visualizations.addZoneVisualizations(zones, words.get(0).getPage());
|
||||
visualizations.addLineVisualizationsFromZones(zones, words.get(0).getPage());
|
||||
visualizations.addCharactersWithNeighbours(zones, words.get(0).getPage());
|
||||
}
|
||||
|
||||
var pageBlocks = toAbstractPageBlocks(zones);
|
||||
|
||||
var classificationPage = new ClassificationPage(pageBlocks);
|
||||
classificationPage.setCleanRulings(rulings);
|
||||
|
||||
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
|
||||
mergeIntersectingBlocks(pageBlocks, rulingsWithoutTextRulings, 0, 0);
|
||||
|
||||
if (layoutParsingType == LayoutParsingType.DOCUMINE
|
||||
|| layoutParsingType == LayoutParsingType.REDACT_MANAGER
|
||||
|| layoutParsingType == LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH) {
|
||||
combineBlocks(classificationPage, layoutParsingType);
|
||||
combineBlocks(pageBlocks, rulings, layoutParsingType);
|
||||
}
|
||||
|
||||
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
||||
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
|
||||
mergeIntersectingBlocks(pageBlocks, rulingsWithoutTextRulings, 0, 0);
|
||||
}
|
||||
|
||||
return classificationPage;
|
||||
return pageBlocks;
|
||||
}
|
||||
|
||||
|
||||
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones) {
|
||||
private List<TextPageBlock> toAbstractPageBlocks(List<Zone> zones) {
|
||||
|
||||
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
|
||||
List<TextPageBlock> abstractPageBlocks = new ArrayList<>();
|
||||
zones.forEach(zone -> {
|
||||
|
||||
List<Word> words = new ArrayList<>();
|
||||
@ -88,29 +80,23 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
public void combineBlocks(ClassificationPage page, LayoutParsingType layoutParsingType) {
|
||||
public void combineBlocks(List<TextPageBlock> blocks, CleanRulings rulingsWithoutTextRulings, LayoutParsingType layoutParsingType) {
|
||||
|
||||
TextPageBlock previous = new TextPageBlock();
|
||||
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
||||
CleanRulings usedRulings = page.getCleanRulings().withoutTextRulings();
|
||||
ListIterator<TextPageBlock> itty = blocks.listIterator();
|
||||
while (itty.hasNext()) {
|
||||
|
||||
AbstractPageBlock block = itty.next();
|
||||
if (block instanceof TablePageBlock) {
|
||||
previous = new TextPageBlock();
|
||||
continue;
|
||||
}
|
||||
TextPageBlock current = (TextPageBlock) block;
|
||||
TextPageBlock current = itty.next();
|
||||
|
||||
if (previous != null && !previous.getWords().isEmpty()) {
|
||||
|
||||
if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) {
|
||||
if (current.getDir() != previous.getDir() || rulingsWithoutTextRulings.lineBetween(current, previous)) {
|
||||
previous = current;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.isHeadline() || previous.isHeadline()) {
|
||||
if (intersectsYWithPreviousHavingMaxOneLine(previous, current, page)) {
|
||||
if (intersectsYWithPreviousHavingMaxOneLine(previous, current)) {
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, false);
|
||||
} else {
|
||||
previous = current;
|
||||
@ -119,7 +105,7 @@ public class DocstrumBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
||||
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, blocks)) {
|
||||
// previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, layoutParsingType != LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH);
|
||||
continue;
|
||||
@ -130,12 +116,12 @@ public class DocstrumBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, page)) {
|
||||
if (isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(previous, current, blocks)) {
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, page)) {
|
||||
if (isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(previous, current, blocks)) {
|
||||
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||
continue;
|
||||
}
|
||||
@ -144,43 +130,43 @@ public class DocstrumBlockificationService {
|
||||
previous = current;
|
||||
}
|
||||
|
||||
mergeIntersectingBlocks(page, usedRulings, 0, Y_THRESHOLD);
|
||||
mergeIntersectingBlocks(blocks, rulingsWithoutTextRulings, 0, Y_THRESHOLD);
|
||||
}
|
||||
|
||||
|
||||
private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||
private boolean isOnlyIntersectingYAndOnelineOrPrevoiusTwoLines(TextPageBlock previous, TextPageBlock current, List<? extends AbstractPageBlock> allBlocks) {
|
||||
|
||||
return current.intersectsY(previous) //
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() == 2 && current.getNumberOfLines() == 1) //
|
||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 0;
|
||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, allBlocks) <= 0;
|
||||
}
|
||||
|
||||
|
||||
private boolean isSameTopOrBottomWithPreviousHavingMaxTwoLinesAndCurrentThanOneAndMax4OtherBlocksOnHeight(TextPageBlock previous,
|
||||
TextPageBlock current,
|
||||
ClassificationPage page) {
|
||||
List<? extends AbstractPageBlock> allBlocks) {
|
||||
|
||||
return (Math.abs(previous.getMaxY() - current.getMaxY()) < THRESHOLD || Math.abs(previous.getMinY() - current.getMinY()) < THRESHOLD) //
|
||||
&& (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1 || previous.getNumberOfLines() >= 2 && current.getNumberOfLines() == 1) //
|
||||
&& !hasBetween(current, previous, page.getTextBlocks()) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) <= 4;
|
||||
&& !hasBetween(current, previous, allBlocks) && numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, allBlocks) <= 4;
|
||||
}
|
||||
|
||||
|
||||
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current) {
|
||||
|
||||
return previous.intersectsY(current) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1);
|
||||
}
|
||||
|
||||
|
||||
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, List<TextPageBlock> allBlocks) {
|
||||
|
||||
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
|
||||
&& previous.intersectsY(current) //
|
||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, page.getTextBlocks()) == 0;
|
||||
&& numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(current, previous, allBlocks) == 0;
|
||||
}
|
||||
|
||||
|
||||
private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator<AbstractPageBlock> itty, boolean toDuplicate) {
|
||||
private TextPageBlock combineBlocksAndResetIterator(TextPageBlock previous, TextPageBlock current, ListIterator<TextPageBlock> itty, boolean toDuplicate) {
|
||||
|
||||
previous.addAll(current.getWords());
|
||||
previous = buildTextBlock(previous.getWords(), 0);
|
||||
@ -196,7 +182,7 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private boolean hasBetween(TextPageBlock block, TextPageBlock other, List<AbstractPageBlock> allBlocks) {
|
||||
private boolean hasBetween(TextPageBlock block, TextPageBlock other, List<? extends AbstractPageBlock> allBlocks) {
|
||||
|
||||
for (AbstractPageBlock current : allBlocks) {
|
||||
|
||||
@ -213,7 +199,7 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
private int numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(TextPageBlock block, TextPageBlock other, List<AbstractPageBlock> allBlocks) {
|
||||
private int numberOfYIntersectionsOfSmallerBlocksWithOtherBlocks(TextPageBlock block, TextPageBlock other, List<? extends AbstractPageBlock> allBlocks) {
|
||||
|
||||
double minY = Math.min(block.getMinY(), other.getMinY());
|
||||
double maxY = Math.min(block.getMaxY(), other.getMaxY());
|
||||
@ -234,25 +220,18 @@ public class DocstrumBlockificationService {
|
||||
}
|
||||
|
||||
|
||||
public void mergeIntersectingBlocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) {
|
||||
public void mergeIntersectingBlocks(List<TextPageBlock> blocks, CleanRulings usedRulings, float xThreshold, float yThreshold) {
|
||||
|
||||
var blocks = page.getTextBlocks();
|
||||
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
||||
ListIterator<TextPageBlock> itty = blocks.listIterator();
|
||||
while (itty.hasNext()) {
|
||||
AbstractPageBlock block = itty.next();
|
||||
if (block == null) {
|
||||
continue;
|
||||
}
|
||||
if (block instanceof TablePageBlock) {
|
||||
TextPageBlock current = itty.next();
|
||||
if (current == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (block.getClassification() != null && block.getClassification().isHeadline()) {
|
||||
if (current.getClassification() != null && current.getClassification().isHeadline()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
TextPageBlock current = (TextPageBlock) block;
|
||||
|
||||
for (int i = 0; i < blocks.size(); i++) {
|
||||
|
||||
AbstractPageBlock abstractPageBlock = blocks.get(i);
|
||||
|
||||
@ -33,14 +33,14 @@ public class DocuMineBlockificationService {
|
||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The textPositions of a page.
|
||||
* @param words The words of a page.
|
||||
* @param cleanRulings All rulings on a page
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<Word> textPositions, CleanRulings cleanRulings) {
|
||||
public List<TextPageBlock> blockify(List<Word> words, CleanRulings cleanRulings) {
|
||||
|
||||
List<Word> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> textPageBlocks = new ArrayList<>();
|
||||
List<TextPageBlock> textPageBlocks = new ArrayList<>();
|
||||
|
||||
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
|
||||
|
||||
@ -52,7 +52,7 @@ public class DocuMineBlockificationService {
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Double splitX1 = null;
|
||||
for (Word word : textPositions) {
|
||||
for (Word word : words) {
|
||||
|
||||
boolean lineSeparation = prev != null && word.getYDirAdj() - prev.getMaxYDirAdj() > Math.min(word.getHeight(), prev.getHeight()) * 1.1;
|
||||
boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight();
|
||||
@ -120,7 +120,7 @@ public class DocuMineBlockificationService {
|
||||
|
||||
textPageBlocks.add(new TextPageBlock(chunkWords));
|
||||
|
||||
return new ClassificationPage(textPageBlocks);
|
||||
return textPageBlocks;
|
||||
}
|
||||
|
||||
|
||||
@ -171,8 +171,9 @@ public class DocuMineBlockificationService {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold) && (current.getClassification() == null || current.getClassification()
|
||||
.equals(inner.getClassification()))) {
|
||||
if (current.getDir() == inner.getDir() &&//
|
||||
current.intersects(inner, yThreshold, xThreshold) &&//
|
||||
(current.getClassification() == null || current.getClassification().equals(inner.getClassification()))) {
|
||||
|
||||
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||
current.addAll(inner.getWords());
|
||||
|
||||
@ -26,24 +26,24 @@ public class RedactManagerBlockificationService {
|
||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||
*
|
||||
* @param textPositions The words of a page.
|
||||
* @param words The words of a page.
|
||||
* @param visualizations
|
||||
* @return Page object that contains the Textblock and text statistics.
|
||||
*/
|
||||
public ClassificationPage blockify(List<Word> textPositions, CleanRulings cleanRulings, LayoutDebugLayer visualizations) {
|
||||
public List<TextPageBlock> blockify(List<Word> words, CleanRulings cleanRulings, LayoutDebugLayer visualizations) {
|
||||
|
||||
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
|
||||
|
||||
int indexOnPage = 0;
|
||||
List<Word> chunkWords = new ArrayList<>();
|
||||
List<AbstractPageBlock> chunkBlockList = new ArrayList<>();
|
||||
List<TextPageBlock> chunkBlockList = new ArrayList<>();
|
||||
|
||||
double minX = 1000, maxX = 0, minY = 1000, maxY = 0;
|
||||
Word prev = null;
|
||||
|
||||
boolean wasSplitted = false;
|
||||
Double splitX1 = null;
|
||||
for (Word word : textPositions) {
|
||||
for (Word word : words) {
|
||||
|
||||
boolean lineSeparation = word.getYDirAdj() - maxY > word.getHeight() * 1.25;
|
||||
boolean startFromTop = prev != null && word.getYDirAdj() < prev.getYDirAdj() - prev.getTextHeight();
|
||||
@ -111,7 +111,7 @@ public class RedactManagerBlockificationService {
|
||||
chunkBlockList.add(cb1);
|
||||
}
|
||||
|
||||
Iterator<AbstractPageBlock> itty = chunkBlockList.iterator();
|
||||
Iterator<TextPageBlock> itty = chunkBlockList.iterator();
|
||||
|
||||
TextPageBlock previousLeft = null;
|
||||
TextPageBlock previousRight = null;
|
||||
@ -159,12 +159,12 @@ public class RedactManagerBlockificationService {
|
||||
|
||||
previous = block;
|
||||
}
|
||||
if (!textPositions.isEmpty()) {
|
||||
if (!words.isEmpty() && visualizations != null) {
|
||||
visualizations.addTextBlockVisualizations(chunkBlockList.stream()
|
||||
.toList(), textPositions.get(0).getPage());
|
||||
.toList(), words.get(0).getPage());
|
||||
}
|
||||
|
||||
return new ClassificationPage(chunkBlockList);
|
||||
return chunkBlockList;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -19,7 +19,7 @@ import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
|
||||
@ -5,7 +5,6 @@ import static java.util.stream.Collectors.groupingBy;
|
||||
import static java.util.stream.Collectors.toList;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedList;
|
||||
@ -15,7 +14,6 @@ import java.util.NoSuchElementException;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.DocumentTree;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.AbstractSemanticNode;
|
||||
@ -36,8 +34,8 @@ import com.iqser.red.service.redaction.v1.server.model.document.textblock.TextBl
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationFooter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.ClassificationHeader;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
@ -112,9 +110,7 @@ public class DocumentGraphFactory {
|
||||
|
||||
public void addParagraphOrHeadline(GenericSemanticNode parentNode,
|
||||
TextPageBlock originalTextBlock,
|
||||
Context context,
|
||||
List<TextPageBlock> textBlocksToMerge,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
Context context, LayoutParsingType layoutParsingType) {
|
||||
|
||||
Page page = context.getPage(originalTextBlock.getPage());
|
||||
|
||||
@ -129,17 +125,10 @@ public class DocumentGraphFactory {
|
||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||
}
|
||||
|
||||
List<TextPageBlock> textBlocks = new ArrayList<>();
|
||||
textBlocks.add(originalTextBlock);
|
||||
textBlocks.addAll(textBlocksToMerge);
|
||||
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSort(textBlocks), node, context, page);
|
||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeAndSort(originalTextBlock), node, context, page);
|
||||
|
||||
if (node instanceof DuplicatedParagraph duplicatedParagraph) {
|
||||
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(textBlocks.stream()
|
||||
.flatMap(tb -> tb.getWords()
|
||||
.stream())
|
||||
.collect(Collectors.toList()), node, context, page);
|
||||
AtomicTextBlock unsortedTextBlock = context.textBlockFactory.buildAtomicTextBlock(originalTextBlock.getWords(), node, context, page);
|
||||
duplicatedParagraph.setUnsortedLeafTextBlock(unsortedTextBlock);
|
||||
}
|
||||
|
||||
|
||||
@ -29,19 +29,19 @@ public class SearchTextWithTextPositionFactory {
|
||||
public static final double LINEBREAK_DELTA_TOLERANCE = 1.5;
|
||||
|
||||
|
||||
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<Word> sequences) {
|
||||
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<Word> words) {
|
||||
|
||||
if (sequences.isEmpty() || sequences.stream()
|
||||
if (words.isEmpty() || words.stream()
|
||||
.allMatch(sequence -> sequence.getCharacters().isEmpty())) {
|
||||
return SearchTextWithTextPositionDto.empty();
|
||||
}
|
||||
|
||||
Context context = new Context();
|
||||
|
||||
RedTextPosition currentTextPosition = sequences.get(0).getCharacters().get(0).getTextPosition();
|
||||
RedTextPosition currentTextPosition = words.get(0).getCharacters().get(0).getTextPosition();
|
||||
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build();
|
||||
|
||||
for (Word word : sequences) {
|
||||
for (Word word : words) {
|
||||
for (int i = 0; i < word.getCharacters().size(); ++i) {
|
||||
|
||||
currentTextPosition = word.getCharacters().get(i).getTextPosition();
|
||||
@ -66,7 +66,7 @@ public class SearchTextWithTextPositionFactory {
|
||||
++context.stringIdx;
|
||||
}
|
||||
|
||||
List<Rectangle2D> positions = sequences.stream()
|
||||
List<Rectangle2D> positions = words.stream()
|
||||
.map(Word::getCharacters)
|
||||
.flatMap(Collection::stream)
|
||||
.map(Character::getTextPosition)
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.factory;
|
||||
|
||||
import static java.lang.String.format;
|
||||
import static java.util.Collections.emptyList;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
@ -17,12 +17,13 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableOfContents;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TableMergingUtility;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.tables.TableMergingUtility;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@ -60,7 +61,7 @@ public class SectionNodeFactory {
|
||||
|
||||
section.setTreeId(getTreeId(parentNode, context, section));
|
||||
|
||||
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document);
|
||||
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section);
|
||||
boolean containsTablesAndTextBlocks = containsTablesAndTextBlocks(pageBlocks);
|
||||
if (containsTablesAndTextBlocks) {
|
||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
|
||||
@ -73,8 +74,13 @@ public class SectionNodeFactory {
|
||||
} else if (type.equals(SectionTreeEntry.Type.SUPER_SECTION)) {
|
||||
// If a SuperSection contains more blocks than just a headline, we add a Section which contains the remaining textblocks.
|
||||
addSection(layoutParsingType, section, SectionTreeEntry.Type.SECTION, pageBlocks, emptyList(), context, document);
|
||||
} else if (!pageBlocks.isEmpty() && pageBlocks.get(0) instanceof TextPageBlock) {
|
||||
List<TextPageBlock> textPageBlocks = pageBlocks.stream()
|
||||
.map(block -> (TextPageBlock) block)
|
||||
.toList();
|
||||
addParagraphsAndHeadlinesToSection(layoutParsingType, textPageBlocks, context, section);
|
||||
} else {
|
||||
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
|
||||
addTablesToSection(pageBlocks, context, section, document, layoutParsingType);
|
||||
}
|
||||
|
||||
images.stream()
|
||||
@ -85,6 +91,28 @@ public class SectionNodeFactory {
|
||||
}
|
||||
|
||||
|
||||
private static void addTablesToSection(List<AbstractPageBlock> pageBlocks,
|
||||
DocumentGraphFactory.Context context,
|
||||
AbstractSemanticNode section,
|
||||
Document document,
|
||||
LayoutParsingType layoutParsingType) {
|
||||
|
||||
List<AbstractPageBlock> remainingBlocks = new ArrayList<>(pageBlocks);
|
||||
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
||||
for (AbstractPageBlock abstractPageBlock : pageBlocks) {
|
||||
if (alreadyMerged.contains(abstractPageBlock)) {
|
||||
continue;
|
||||
}
|
||||
if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(tablesToMerge);
|
||||
remainingBlocks.removeAll(tablesToMerge);
|
||||
TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context, document);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, AbstractSemanticNode section) {
|
||||
|
||||
if (parentNode == null) {
|
||||
@ -98,54 +126,63 @@ public class SectionNodeFactory {
|
||||
private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
DocumentGraphFactory.Context context,
|
||||
AbstractSemanticNode section,
|
||||
Document document) {
|
||||
AbstractSemanticNode section) {
|
||||
|
||||
if (pageBlocks.get(0).isHeadline()) {
|
||||
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, List.of(pageBlocks.get(0)), context, section, document);
|
||||
addParagraphsAndHeadlinesToSection(layoutParsingType, List.of((TextPageBlock) pageBlocks.get(0)), context, section);
|
||||
pageBlocks.remove(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
|
||||
List<AbstractPageBlock> pageBlocks,
|
||||
DocumentGraphFactory.Context context,
|
||||
AbstractSemanticNode section,
|
||||
Document document) {
|
||||
private void addParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
|
||||
List<TextPageBlock> pageBlocks,
|
||||
DocumentGraphFactory.Context context,
|
||||
AbstractSemanticNode section) {
|
||||
|
||||
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
||||
List<AbstractPageBlock> remainingBlocks = new LinkedList<>(pageBlocks);
|
||||
for (AbstractPageBlock abstractPageBlock : pageBlocks) {
|
||||
List<TextPageBlock> mergedPageBlocks = pageBlocks;
|
||||
if (pageBlocks.size() > 1 && (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) || layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER_OLD))) {
|
||||
mergedPageBlocks = mergeBlocks(pageBlocks);
|
||||
}
|
||||
|
||||
if (alreadyMerged.contains(abstractPageBlock)) {
|
||||
continue;
|
||||
}
|
||||
for (TextPageBlock textPageBlock : mergedPageBlocks) {
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, textPageBlock, context, layoutParsingType);
|
||||
}
|
||||
}
|
||||
|
||||
remainingBlocks.removeAll(alreadyMerged);
|
||||
|
||||
if (abstractPageBlock instanceof TextPageBlock) {
|
||||
private static List<TextPageBlock> mergeBlocks(List<TextPageBlock> pageBlocks) {
|
||||
|
||||
switch (layoutParsingType) {
|
||||
case REDACT_MANAGER, DOCUMINE, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH -> {
|
||||
alreadyMerged.add(abstractPageBlock);
|
||||
remainingBlocks.remove(abstractPageBlock);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType);
|
||||
}
|
||||
default -> {
|
||||
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(textBlocks);
|
||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks, layoutParsingType);
|
||||
}
|
||||
UnionFind<TextPageBlock> blockUnionFind = new UnionFind<>(new HashSet<>(pageBlocks));
|
||||
for (int i = 0; i < pageBlocks.size(); i++) {
|
||||
TextPageBlock textPageBlock1 = pageBlocks.get(i);
|
||||
for (int j = i; j < pageBlocks.size(); j++) {
|
||||
if (i == j) {
|
||||
continue;
|
||||
}
|
||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||
List<TablePageBlock> tablesToMerge = TableMergingUtility.findConsecutiveTablesWithSameColCountAndSameHeaders(tablePageBlock, remainingBlocks);
|
||||
alreadyMerged.addAll(tablesToMerge);
|
||||
TableNodeFactory.addTable(layoutParsingType, section, tablesToMerge, context, document);
|
||||
} else {
|
||||
throw new RuntimeException(format("Unhandled AbstractPageBlockType %s!", abstractPageBlock.getClass()));
|
||||
var textPageBlock2 = pageBlocks.get(j);
|
||||
if (!Objects.equals(textPageBlock2.getPage(), textPageBlock1.getPage())) {
|
||||
continue;
|
||||
}
|
||||
if (!Objects.equals(textPageBlock2.getDir(), textPageBlock1.getDir())) {
|
||||
continue;
|
||||
}
|
||||
if (!Objects.equals(textPageBlock2.getClassification(), textPageBlock1.getClassification())) {
|
||||
continue;
|
||||
}
|
||||
if (!textPageBlock2.intersectsYPdf(textPageBlock1)) {
|
||||
continue;
|
||||
}
|
||||
if (textPageBlock2.isToDuplicate()) {
|
||||
continue;
|
||||
}
|
||||
blockUnionFind.union(textPageBlock2, textPageBlock1);
|
||||
}
|
||||
}
|
||||
return blockUnionFind.getGroups()
|
||||
.stream()
|
||||
.map(TextPageBlock::merge)
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
@ -222,18 +259,4 @@ public class SectionNodeFactory {
|
||||
return splitList;
|
||||
}
|
||||
|
||||
|
||||
private List<TextPageBlock> findTextBlocksWithSameClassificationAndAlignsY(TextPageBlock atc, List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
return pageBlocks.stream()
|
||||
.filter(abstractTextContainer -> !abstractTextContainer.equals(atc))
|
||||
.filter(abstractTextContainer -> abstractTextContainer.getPage() == atc.getPage())
|
||||
.filter(abstractTextContainer -> abstractTextContainer instanceof TextPageBlock)
|
||||
.filter(abstractTextContainer -> abstractTextContainer.intersectsYPdf(atc))
|
||||
.map(abstractTextContainer -> (TextPageBlock) abstractTextContainer)
|
||||
.filter(abstractTextContainer -> abstractTextContainer.getDir() == atc.getDir())
|
||||
.filter(abstractTextContainer -> !abstractTextContainer.isToDuplicate())
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -4,7 +4,6 @@ import static java.util.Collections.emptyList;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.GenericSemanticNode;
|
||||
@ -17,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.SectionTreeEntry;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||
|
||||
@ -50,8 +50,6 @@ public class TableNodeFactory {
|
||||
List<Integer> treeId = context.getDocumentTree().createNewChildEntryAndReturnId(parentNode, table);
|
||||
table.setTreeId(treeId);
|
||||
addTableCells(layoutParsingType, mergedRows, table, context, document);
|
||||
|
||||
ifTableHasNoHeadersSetFirstRowAsHeaders(table);
|
||||
}
|
||||
|
||||
|
||||
@ -76,16 +74,6 @@ public class TableNodeFactory {
|
||||
}
|
||||
|
||||
|
||||
private void ifTableHasNoHeadersSetFirstRowAsHeaders(Table table) {
|
||||
|
||||
if (table.streamHeaders()
|
||||
.findAny().isEmpty()) {
|
||||
table.streamRow(0)
|
||||
.forEach(tableCellNode -> tableCellNode.setHeader(true));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addTableCells(LayoutParsingType layoutParsingType, List<List<Cell>> rows, Table table, DocumentGraphFactory.Context context, Document document) {
|
||||
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
@ -115,32 +103,32 @@ public class TableNodeFactory {
|
||||
TextBlock textBlock;
|
||||
if (cell.getTextBlocks().isEmpty()) {
|
||||
tableCell.setLeafTextBlock(context.getTextBlockFactory().emptyTextBlock(tableNode, context, page));
|
||||
} else if (cell.getTextBlocks().size() == 1) {
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(cell.getTextBlocks().get(0).getWords(), tableCell, context, page);
|
||||
} else if (cell.getTextBlocks().size() == 1 && cell.getTextBlocks().get(0) instanceof TextPageBlock textPageBlock) {
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(textPageBlock.getWords(), tableCell, context, page);
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else if (firstTextBlockIsHeadline(cell)) {
|
||||
SectionNodeFactory.addSection(layoutParsingType,
|
||||
tableCell,
|
||||
SectionTreeEntry.Type.SECTION,
|
||||
cell.getTextBlocks()
|
||||
.stream()
|
||||
.map(tb -> (AbstractPageBlock) tb)
|
||||
.collect(Collectors.toList()),
|
||||
emptyList(),
|
||||
context,
|
||||
document);
|
||||
} else if (cellAreaIsSmallerThanPageAreaTimesThreshold(cell, page)) {
|
||||
List<Word> sequences = TextPositionOperations.mergeAndSort(cell.getTextBlocks());
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(sequences, tableCell, context, page);
|
||||
} else if (firstTextBlockIsHeadline(cell) || containsTables(cell.getTextBlocks())) {
|
||||
SectionNodeFactory.addSection(layoutParsingType, tableCell, SectionTreeEntry.Type.SECTION, cell.getTextBlocks(), emptyList(), context, document);
|
||||
} else if (cellAreaIsSmallerThanThreshold(cell, page)) {
|
||||
List<Word> words = TextPositionOperations.sort(cell.getWords());
|
||||
textBlock = context.getTextBlockFactory().buildAtomicTextBlock(words, tableCell, context, page);
|
||||
tableCell.setLeafTextBlock(textBlock);
|
||||
} else {
|
||||
cell.getTextBlocks()
|
||||
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList(), layoutParsingType));
|
||||
.stream()
|
||||
.map(block -> (TextPageBlock) block)
|
||||
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, layoutParsingType));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private boolean cellAreaIsSmallerThanPageAreaTimesThreshold(Cell cell, Page page) {
|
||||
private boolean containsTables(List<AbstractPageBlock> pageBlocks) {
|
||||
|
||||
return pageBlocks.stream()
|
||||
.anyMatch(pageBlock -> pageBlock instanceof TablePageBlock);
|
||||
}
|
||||
|
||||
|
||||
private boolean cellAreaIsSmallerThanThreshold(Cell cell, Page page) {
|
||||
|
||||
return cell.getArea() < TABLE_CELL_MERGE_CONTENTS_SIZE_THRESHOLD * page.getHeight() * page.getWidth();
|
||||
}
|
||||
|
||||
@ -18,16 +18,16 @@ public class TextBlockFactory {
|
||||
long textBlockIdx;
|
||||
|
||||
|
||||
public AtomicTextBlock buildAtomicTextBlock(List<Word> sequences, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
|
||||
public AtomicTextBlock buildAtomicTextBlock(List<Word> words, SemanticNode parent, DocumentGraphFactory.Context context, Page page) {
|
||||
|
||||
Integer numberOnPage = context.getAndIncrementTextBlockNumberOnPage(page);
|
||||
return buildAtomicTextBlock(sequences, parent, numberOnPage, page);
|
||||
return buildAtomicTextBlock(words, parent, numberOnPage, page);
|
||||
}
|
||||
|
||||
|
||||
public AtomicTextBlock buildAtomicTextBlock(List<Word> sequences, SemanticNode parent, Integer numberOnPage, Page page) {
|
||||
public AtomicTextBlock buildAtomicTextBlock(List<Word> words, SemanticNode parent, Integer numberOnPage, Page page) {
|
||||
|
||||
SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionDto(sequences);
|
||||
SearchTextWithTextPositionDto searchTextWithTextPositionDto = SearchTextWithTextPositionFactory.buildSearchTextToTextPositionDto(words);
|
||||
int offset = stringOffset;
|
||||
stringOffset += searchTextWithTextPositionDto.getSearchText().length();
|
||||
long idx = textBlockIdx;
|
||||
|
||||
@ -11,14 +11,15 @@ import java.util.stream.Collectors;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.rendering.ImageType;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@Service
|
||||
@UtilityClass
|
||||
public class FindGraphicsRaster {
|
||||
|
||||
// Pixels that are lighter then this threshold are ignored
|
||||
@ -33,7 +34,8 @@ public class FindGraphicsRaster {
|
||||
|
||||
var renderer = new PDFRenderer(doc);
|
||||
var img = renderer.renderImageWithDPI(pageInformation.number() - 1, DPI, ImageType.GRAY);
|
||||
var imageCtm = CoordinateTransforms.calculateImageCoordsToInitialUserSpaceCoords(pageInformation, CoordinateTransforms.calculateScalingFactor(pageInformation, img.getWidth()));
|
||||
var imageCtm = CoordinateTransforms.calculateImageCoordsToInitialUserSpaceCoords(pageInformation,
|
||||
CoordinateTransforms.calculateScalingFactor(pageInformation, img.getWidth()));
|
||||
return findCCBoundingBoxes(img, remove, THRESHOLD, DPI / 72, imageCtm);
|
||||
}
|
||||
|
||||
@ -47,13 +49,15 @@ public class FindGraphicsRaster {
|
||||
var w = image.getWidth();
|
||||
var pixels = new int[w * h];
|
||||
image.getRaster().getPixels(0, 0, w, h, pixels);
|
||||
remove.stream().map(rect -> inverseCTM.createTransformedShape(rect).getBounds2D()).forEach(box -> {
|
||||
for (int y = (int) Math.floor(box.getMinY() / rescale); y <= (int) Math.min(Math.ceil(box.getMaxY() / rescale), h); y++) {
|
||||
for (int x = (int) Math.floor(box.getMinX() / rescale); x <= (int) Math.min(Math.ceil(box.getMaxX() / rescale), w); x++) {
|
||||
pixels[w * y + x] = grayScaleTresh;
|
||||
}
|
||||
}
|
||||
});
|
||||
remove.stream()
|
||||
.map(rect -> RectangleTransformations.transform(rect, inverseCTM))
|
||||
.forEach(box -> {
|
||||
for (int y = (int) Math.floor(box.getMinY() / rescale); y <= (int) Math.min(Math.ceil(box.getMaxY() / rescale), h); y++) {
|
||||
for (int x = (int) Math.floor(box.getMinX() / rescale); x <= (int) Math.min(Math.ceil(box.getMaxX() / rescale), w); x++) {
|
||||
pixels[w * y + x] = grayScaleTresh;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// var image2 = createImageFromMatrix(pixels, w, h);
|
||||
|
||||
@ -130,8 +134,10 @@ public class FindGraphicsRaster {
|
||||
}
|
||||
}
|
||||
}
|
||||
return boundingBoxes.stream().filter(box -> box.area() > 0).map(box -> box.transform(imageCTM)).collect(Collectors.toList());
|
||||
return boundingBoxes.stream()
|
||||
.filter(box -> box.area() > 0)
|
||||
.map(box -> box.transform(imageCTM))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -4,15 +4,14 @@ import java.awt.geom.Rectangle2D;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.ImageType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
@ -25,32 +24,13 @@ public class GraphicExtractorService {
|
||||
private static final int MIN_GRAPHICS_AREA = 500;
|
||||
|
||||
private final GraphicsClusteringService graphicsClusteringService;
|
||||
private final FindGraphicsRaster findGraphicsRaster;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public List<Box> extractPathElementGraphics(PDDocument pdDocument,
|
||||
PDPage pdPage,
|
||||
int pageNumber,
|
||||
CleanRulings cleanRulings,
|
||||
List<Word> words,
|
||||
boolean graphicsRaster) {
|
||||
public List<ClassifiedImage> extractPathElementGraphics(List<Box> graphicBBoxes, int pageNumber, CleanRulings cleanRulings) {
|
||||
|
||||
List<Box> characterBBoxes = getCharacterBBoxes(words);
|
||||
List<Box> classifiedRulingsBoxes = getLineBBoxesOfAllClassifiedRulings(cleanRulings);
|
||||
|
||||
GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true);
|
||||
List<Box> graphicBBoxes = graphicBBDetector.findGraphicBB();
|
||||
|
||||
if (graphicsRaster) {
|
||||
// This should only be used if ocr was performed, it is currently in an early stage and needs to be improved.
|
||||
graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument,
|
||||
characterBBoxes.stream()
|
||||
.map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4))
|
||||
.collect(Collectors.toList()),
|
||||
PageInformation.fromPDPage(pageNumber, pdPage)));
|
||||
}
|
||||
|
||||
List<Box> filteredGraphicBBoxes = graphicBBoxes.stream()
|
||||
.filter(box -> !box.intersectsAny(classifiedRulingsBoxes, 4))
|
||||
.collect(Collectors.toList());
|
||||
@ -59,19 +39,11 @@ public class GraphicExtractorService {
|
||||
|
||||
return clusters.stream()
|
||||
.filter(box -> box.area() > MIN_GRAPHICS_AREA && box.height() > MIN_GRAPHICS_SIDE_LENGTH && box.width() > MIN_GRAPHICS_SIDE_LENGTH)
|
||||
.map(g -> new ClassifiedImage(new Rectangle2D.Double(g.x1, g.y1, g.width(), g.height()), ImageType.GRAPHIC, false, pageNumber, ""))
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
private List<Box> getCharacterBBoxes(List<Word> words) {
|
||||
|
||||
return words.stream()
|
||||
.map(BoundingBox::getBBoxPdf)
|
||||
.map(Box::new)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
private List<Box> getLineBBoxesOfAllClassifiedRulings(CleanRulings cleanRulings) {
|
||||
|
||||
return cleanRulings.buildAll()
|
||||
|
||||
@ -14,7 +14,7 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.Section;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNode;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Outline;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -199,11 +199,18 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
}
|
||||
|
||||
|
||||
private void addVisibleRulings(List<Ruling> path, boolean stroke) throws IOException {
|
||||
private void addVisibleRulings(List<Ruling> path, boolean stroke) {
|
||||
|
||||
try {
|
||||
if (stroke && !getGraphicsState().getStrokingColor().isPattern() && isBlack(getGraphicsState().getStrokingColor()) || //
|
||||
!stroke && !getGraphicsState().getNonStrokingColor().isPattern() && isBlack(getGraphicsState().getNonStrokingColor())) {
|
||||
// see spec '8.4.3.6 Line dash pattern'
|
||||
var dashPattern = getGraphicsState().getLineDashPattern();
|
||||
if (dashPattern != null && dashPattern.getDashArray().length > 0) {
|
||||
path.forEach(r -> r.setStyle(Ruling.Style.DASHED));
|
||||
} else {
|
||||
path.forEach(r -> r.setStyle(Ruling.Style.SOLID));
|
||||
}
|
||||
rulings.addAll(path);
|
||||
}
|
||||
} catch (UnsupportedOperationException e) {
|
||||
@ -247,9 +254,7 @@ public class PDFLinesTextStripper extends PDFTextStripper {
|
||||
}
|
||||
|
||||
if (!words.isEmpty()) {
|
||||
previous = words.get(words.size() - 1)
|
||||
.getCharacters()
|
||||
.get(words.get(words.size() - 1).getCharacters().size() - 1).getTextPosition();
|
||||
previous = words.get(words.size() - 1).getCharacters().get(words.get(words.size() - 1).getCharacters().size() - 1).getTextPosition();
|
||||
}
|
||||
|
||||
if (i == 0 && (textPositions.get(i).getUnicode().equals(" ") || textPositions.get(i).getUnicode().equals("\u00A0") || textPositions.get(i).getUnicode().equals("\t"))) {
|
||||
|
||||
@ -0,0 +1,138 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class AreaSweepGridifier {
|
||||
|
||||
public static final double CELL_AREA_CONTAINED_THRESHOLD = 0.8;
|
||||
public static final double MIN_SIZE_FACTOR = 0.5;
|
||||
|
||||
|
||||
/**
|
||||
* Calculates the grid structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
|
||||
* Works well for perfectly straight tables, but fails as soon as the tables are slightly rotated. Then the area sweep will drop some cells or duplicate them unnecessarily.
|
||||
*
|
||||
* @return TablePageBlock Structure as a rows of cells matrix
|
||||
*/
|
||||
public List<List<Cell>> gridify(Set<Cell> cells, AffineTransform pageToPdfTransform, double minCellWidth, double minCellHeight) {
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
var colDividers = getColDividers(cells, minCellWidth);
|
||||
var rowDividers = getRowDividers(cells, minCellHeight);
|
||||
|
||||
List<List<Cell>> rowsOfCells = new ArrayList<>();
|
||||
|
||||
for (int i = 1; i < rowDividers.size(); i++) {
|
||||
double prevY = rowDividers.get(i - 1);
|
||||
double y = rowDividers.get(i);
|
||||
|
||||
List<Cell> row = new ArrayList<>();
|
||||
|
||||
for (int j = 1; j < colDividers.size(); j++) {
|
||||
double prevX = colDividers.get(j - 1);
|
||||
double x = colDividers.get(j);
|
||||
|
||||
var cellFromGridStructure = Cell.fromPageCoordinates(new Point2D.Double(prevX, prevY), new Point2D.Double(x, y), pageToPdfTransform);
|
||||
|
||||
if (!cellFromGridStructure.hasMinimumSize()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Optional<Cell> matchingCell = cells.stream()
|
||||
.map(originalCell -> new CellWithIntersection(originalCell,
|
||||
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBox(), originalCell.getBBox())))
|
||||
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea() > 0)
|
||||
.filter(cellWithIntersection -> cellFromGridStructure.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
|
||||
.max(Comparator.comparing(CellWithIntersection::intersectedArea))
|
||||
.map(CellWithIntersection::originalCell);
|
||||
|
||||
if (matchingCell.isPresent()) {
|
||||
cellFromGridStructure.getTextBlocks().addAll(matchingCell.get().getTextBlocks());
|
||||
cellFromGridStructure.setHeaderCell(matchingCell.get().isHeaderCell());
|
||||
}
|
||||
|
||||
row.add(cellFromGridStructure);
|
||||
|
||||
}
|
||||
|
||||
rowsOfCells.add(row);
|
||||
}
|
||||
|
||||
return rowsOfCells;
|
||||
}
|
||||
|
||||
|
||||
private List<Double> getRowDividers(Collection<Cell> cells, double minCellHeight) {
|
||||
|
||||
Set<Double> uniqueY = new HashSet<>();
|
||||
cells.stream()
|
||||
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
|
||||
.forEach(c -> {
|
||||
uniqueY.add(c.getMinY());
|
||||
uniqueY.add(c.getMaxY());
|
||||
});
|
||||
|
||||
return deduplicate(uniqueY, minCellHeight * MIN_SIZE_FACTOR);
|
||||
}
|
||||
|
||||
|
||||
private List<Double> getColDividers(Collection<Cell> cells, double minCellWidth) {
|
||||
|
||||
Set<Double> uniqueX = new HashSet<>();
|
||||
cells.stream()
|
||||
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
|
||||
.forEach(c -> {
|
||||
uniqueX.add(c.getMinX());
|
||||
uniqueX.add(c.getMaxX());
|
||||
});
|
||||
|
||||
return deduplicate(uniqueX, minCellWidth * MIN_SIZE_FACTOR);
|
||||
}
|
||||
|
||||
|
||||
private List<Double> deduplicate(Set<Double> doubles, double minDistance) {
|
||||
// finds all doubles less than the minDistance apart and replaces them with their average
|
||||
UnionFind<Double> uf = new UnionFind<>(doubles);
|
||||
for (Double x : doubles) {
|
||||
for (Double x2 : doubles) {
|
||||
if (x.equals(x2)) {
|
||||
continue;
|
||||
}
|
||||
if (Math.abs(x - x2) < minDistance) {
|
||||
uf.union(x, x2);
|
||||
}
|
||||
}
|
||||
}
|
||||
return uf.getGroups()
|
||||
.stream()
|
||||
.map(xs -> xs.stream()
|
||||
.mapToDouble(Double::doubleValue).average()
|
||||
.orElseThrow())
|
||||
.sorted()
|
||||
.toList();
|
||||
}
|
||||
|
||||
|
||||
record CellWithIntersection(Cell originalCell, double intersectedArea) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR;
|
||||
|
||||
@ -14,15 +14,6 @@ public class RectangularIntersectionFinder {
|
||||
|
||||
public static List<Rectangle2D> find(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||
|
||||
// // Fix for 211.pdf
|
||||
// for (Ruling r : horizontalRulingLines) {
|
||||
// if (r.getX2() < r.getX1()) {
|
||||
// double a = r.getX2();
|
||||
// r.x2 = (float) r.getX1();
|
||||
// r.x1 = (float) a;
|
||||
// }
|
||||
// }
|
||||
|
||||
List<Rectangle2D> foundRectangles = new ArrayList<>();
|
||||
Map<Point2D, RulingIntersectionFinder.IntersectingRulings> intersectionPoints = RulingIntersectionFinder.findNaive(horizontalRulingLines, verticalRulingLines);
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.Collections;
|
||||
@ -10,6 +10,7 @@ import java.util.Optional;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -33,7 +34,7 @@ public class RulingIntersectionFinder {
|
||||
*/
|
||||
/*
|
||||
* The algorithm assumes there are only horizontal and vertical lines which are unique in their coordinates. (E.g. no overlapping horizontal lines exist)
|
||||
* As a high level overview, the algorithm uses a sweep line advancing from left to right.
|
||||
* As a high level overview, the algorithm uses a sweep line advancing from lefts to rights.
|
||||
* It dynamically updates the horizontal rulings which are intersected by the current sweep line.
|
||||
* When the sweep line hits a vertical line, it then checks for all intersections with the currently intersected horizontal rulings.
|
||||
* THe trick of the algorithm is using a binary search tree to store the currently intersected horizontal rulings. This way the lookup should be in O(log n).
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
@ -12,7 +12,7 @@ public final class RulingTextDirAdjustUtil {
|
||||
|
||||
/**
|
||||
* Converts a ruling (line of a table) the same way TextPositions are converted in PDFBox.
|
||||
* This will get the y position of the text, adjusted so that 0,0 is upper left and it is adjusted based on the text direction.
|
||||
* This will get the y position of the text, adjusted so that 0,0 is upper lefts and it is adjusted based on the text direction.
|
||||
* <p>
|
||||
* See org.apache.pdfbox.text.TextPosition
|
||||
*/
|
||||
@ -0,0 +1,109 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class TableAreaFiller {
|
||||
|
||||
public Set<Cell> findMissingCells(List<Cell> cells, Rectangle2D areaPDF, AffineTransform pdfToPageTransform) {
|
||||
|
||||
var area = RectangleTransformations.transform(areaPDF, pdfToPageTransform);
|
||||
|
||||
List<Rectangle2D> rectangles = cells.stream()
|
||||
.map(BoundingBox::getBBox)
|
||||
.toList();
|
||||
Set<Rectangle2D> unfilledRects = findMissingRects(rectangles, area);
|
||||
|
||||
AffineTransform pageToPdfTransform = getInverse(pdfToPageTransform);
|
||||
|
||||
return unfilledRects.stream()
|
||||
.map(rect -> Cell.fromPageCoordinates(rect, pageToPdfTransform))
|
||||
.collect(Collectors.toSet());
|
||||
}
|
||||
|
||||
|
||||
public static Set<Rectangle2D> findMissingRects(List<Rectangle2D> rectangles, Rectangle2D area) {
|
||||
|
||||
double minWidth = rectangles.stream()
|
||||
.mapToDouble(Rectangle2D::getWidth)
|
||||
.min().orElse(0) * 0.95;
|
||||
double minHeight = rectangles.stream()
|
||||
.mapToDouble(Rectangle2D::getHeight)
|
||||
.min().orElse(0) * 0.95;
|
||||
|
||||
Set<Rectangle2D> unfilledRects = new HashSet<>();
|
||||
unfilledRects.add(area);
|
||||
for (Rectangle2D rectangle : rectangles) {
|
||||
unfilledRects = fillWithRectangle(unfilledRects, rectangle, minWidth, minHeight);
|
||||
}
|
||||
return unfilledRects;
|
||||
}
|
||||
|
||||
|
||||
private Set<Rectangle2D> fillWithRectangle(Set<Rectangle2D> unfilledRects, Rectangle2D rectToAdd, double minWidth, double minHeight) {
|
||||
|
||||
Set<Rectangle2D> remainingUnfilledRects = new HashSet<>();
|
||||
for (Rectangle2D unfilledRect : unfilledRects) {
|
||||
if (!rectToAdd.intersects(unfilledRect)) {
|
||||
remainingUnfilledRects.add(unfilledRect);
|
||||
continue;
|
||||
}
|
||||
|
||||
boolean topAdded = false;
|
||||
boolean bottomAdded = false;
|
||||
|
||||
// Top rectangle
|
||||
double topHeight = rectToAdd.getY() - unfilledRect.getY();
|
||||
if (topHeight > minHeight) {
|
||||
topAdded = true;
|
||||
Rectangle2D topRect = new Rectangle2D.Double(unfilledRect.getX(), unfilledRect.getY(), unfilledRect.getWidth(), topHeight);
|
||||
remainingUnfilledRects.add(topRect);
|
||||
}
|
||||
// Bottom rectangle
|
||||
double bottomHeight = unfilledRect.getMaxY() - rectToAdd.getMaxY();
|
||||
if (bottomHeight > minHeight) {
|
||||
bottomAdded = true;
|
||||
Rectangle2D bottomRect = new Rectangle2D.Double(unfilledRect.getX(), rectToAdd.getMaxY(), unfilledRect.getWidth(), bottomHeight);
|
||||
remainingUnfilledRects.add(bottomRect);
|
||||
}
|
||||
|
||||
double y = topAdded ? rectToAdd.getY() : unfilledRect.getY();
|
||||
double maxY = bottomAdded ? rectToAdd.getMaxY() : unfilledRect.getMaxY();
|
||||
double height = maxY - y;
|
||||
|
||||
// Left rectangle
|
||||
double leftWidth = rectToAdd.getX() - unfilledRect.getX();
|
||||
if (leftWidth > minWidth) {
|
||||
Rectangle2D leftRect = new Rectangle2D.Double(unfilledRect.getX(), y, leftWidth, height);
|
||||
remainingUnfilledRects.add(leftRect);
|
||||
}
|
||||
// Right rectangle
|
||||
double rightWidth = unfilledRect.getMaxX() - rectToAdd.getMaxX();
|
||||
if (rightWidth > minWidth) {
|
||||
Rectangle2D rightRect = new Rectangle2D.Double(rectToAdd.getMaxX(), y, rightWidth, height);
|
||||
remainingUnfilledRects.add(rightRect);
|
||||
}
|
||||
}
|
||||
return remainingUnfilledRects;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private static AffineTransform getInverse(AffineTransform pdfToPageTransform) {
|
||||
|
||||
return pdfToPageTransform.createInverse();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,270 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
|
||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutDebugLayer;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.Table;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.TableCell;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.TableCellType;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Service
|
||||
@RequiredArgsConstructor
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
public class TableExtractionService {
|
||||
|
||||
public static final int MAX_ROWS_OR_COLS = 500;
|
||||
public static final int MAX_CELLS = MAX_ROWS_OR_COLS * MAX_ROWS_OR_COLS;
|
||||
BlockificationService blockificationService;
|
||||
ReadingOrderService readingOrderService;
|
||||
static int MIN_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
|
||||
static double TABLE_UNIFORMITY_THRESHOLD = 0.7;
|
||||
|
||||
|
||||
public List<TablePageBlock> extractTables(List<Cell> emptyCells,
|
||||
List<Word> words,
|
||||
PageInformation pageInformation,
|
||||
List<Table> idpTables,
|
||||
LayoutParsingType layoutParsingType,
|
||||
LayoutDebugLayer layoutDebugLayer) {
|
||||
|
||||
AffineTransform pdfToPageTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(pageInformation);
|
||||
List<TablePageBlock> tablePageBlocks;
|
||||
if (idpTables == null || idpTables.isEmpty()) {
|
||||
tablePageBlocks = extractTables(emptyCells, words, pdfToPageTransform, layoutParsingType, layoutDebugLayer, pageInformation);
|
||||
} else {
|
||||
tablePageBlocks = buildTableFromIdpResult(idpTables, words, pdfToPageTransform, layoutParsingType);
|
||||
}
|
||||
return tablePageBlocks;
|
||||
}
|
||||
|
||||
|
||||
private List<TablePageBlock> extractTables(List<Cell> emptyCells,
|
||||
List<Word> words,
|
||||
AffineTransform pdfToPageTransform,
|
||||
LayoutParsingType layoutParsingType,
|
||||
LayoutDebugLayer layoutDebugLayer,
|
||||
PageInformation pageInformation) {
|
||||
|
||||
// sort cells by size (height * width) ascending so that textBlocks are always assigned to the smallest cells that contain them
|
||||
emptyCells.sort(CELL_SIZE_COMPARATOR);
|
||||
|
||||
List<Cell> cells = new ArrayList<>(new HashSet<>(emptyCells));
|
||||
DoubleComparisons.sort(cells, GeometricComparators.CELL_SORTER);
|
||||
|
||||
List<Rectangle2D> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
|
||||
// sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
|
||||
// this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells
|
||||
spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);
|
||||
|
||||
List<TablePageBlock> tables = new ArrayList<>();
|
||||
for (Rectangle2D area : spreadsheetAreas) {
|
||||
|
||||
List<Cell> containedCells = new ArrayList<>();
|
||||
for (Cell cell : cells) {
|
||||
if (cell.hasMinimumSize() && area.contains(cell.getBBoxPdf())) {
|
||||
containedCells.add(cell);
|
||||
}
|
||||
}
|
||||
|
||||
if (containedCells.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
// if cells are missing, for example a corner hasn't been recognized (See files/syngenta/CustomerFiles/SinglePages/T4_Page16_138 IDD0000261736.pdf),
|
||||
// the LinkedCell based gridification can deal with this, but the transpose logic will then drop the entire column.
|
||||
// That's why we compute the missing Cells from the spreadsheet area and fill them in.
|
||||
Set<Cell> missingCells = TableAreaFiller.findMissingCells(containedCells, area, pdfToPageTransform);
|
||||
|
||||
layoutDebugLayer.addCellVisualizations(missingCells, pageInformation.number(), Color.RED);
|
||||
layoutDebugLayer.addCellVisualizations(List.of(new Cell(area, pdfToPageTransform)), pageInformation.number(), Color.BLUE);
|
||||
|
||||
containedCells.addAll(missingCells);
|
||||
|
||||
Set<Word> wordsInTable = new HashSet<>(); // As docstrum blockfication recomputes the words, we need to remember the origin words to remove them from the overall list of words
|
||||
for (Cell cell : containedCells) {
|
||||
Function<Point2D, Boolean> contains = p -> cell.getBBoxPdf().contains(p);
|
||||
Function<Rectangle2D, Boolean> containsRect = r -> cell.getBBoxPdf().contains(r);
|
||||
BlocksWithTheirWords blocksWithTheirWords = sortBlocksIntoCell(layoutParsingType, words, tables, contains, containsRect);
|
||||
cell.setTextBlocks(blocksWithTheirWords.blocks());
|
||||
wordsInTable.addAll(blocksWithTheirWords.words());
|
||||
}
|
||||
|
||||
if (containedCells.size() > MAX_CELLS) {
|
||||
continue;
|
||||
}
|
||||
|
||||
var containedCellsWithText = containedCells.stream()
|
||||
.filter(cell -> !cell.getTextBlocks().isEmpty())
|
||||
.toList();
|
||||
|
||||
// verify if table would contain fewer cells with text than the threshold allows
|
||||
if (containedCellsWithText.size() >= MIN_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
|
||||
|
||||
TablePageBlock tablePageBlock = new TableFromCellsExtractor(containedCells, pdfToPageTransform).extract();
|
||||
cells.removeAll(containedCells);
|
||||
addTableIfValid(words, tablePageBlock, tables, wordsInTable);
|
||||
}
|
||||
}
|
||||
|
||||
return tables;
|
||||
}
|
||||
|
||||
|
||||
private static void removeWordsFromCells(List<Word> words, TablePageBlock tablePageBlock) {
|
||||
|
||||
Set<Word> wordsFromCells = new HashSet<>(tablePageBlock.getWords());
|
||||
words.removeAll(wordsFromCells);
|
||||
}
|
||||
|
||||
|
||||
private List<TablePageBlock> buildTableFromIdpResult(List<Table> idpTables, List<Word> words, AffineTransform pdfToPageTransform, LayoutParsingType layoutParsingType) {
|
||||
|
||||
if (idpTables == null || idpTables.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
List<TablePageBlock> tables = new ArrayList<>();
|
||||
for (Table idpTable : idpTables) {
|
||||
if (idpTable.bboxes().size() != 1) {
|
||||
// Should never happen, as IDP still looks at pages individually. (I think so, at least 😅)
|
||||
log.error("IDP Table on multiple pages are not handled yet!");
|
||||
continue;
|
||||
}
|
||||
|
||||
List<Cell> cells = new ArrayList<>(idpTable.cells().size());
|
||||
Set<Word> wordsInTable = new HashSet<>(); // As docstrum blockfication recomputes the words, we need to remember the origin words to remove them from the overall list of words
|
||||
for (TableCell idpCell : idpTable.cells()) {
|
||||
Cell cell = new Cell(idpCell, pdfToPageTransform);
|
||||
if (idpCell.kind().equals(TableCellType.ROW_HEADER) || idpCell.kind().equals(TableCellType.COLUMN_HEADER)) {
|
||||
cell.setHeaderCell(true);
|
||||
}
|
||||
cells.add(cell);
|
||||
Function<Point2D, Boolean> contains = p -> idpCell.textRegion().region().bbox().get().contains(p);
|
||||
Function<Rectangle2D, Boolean> containsRect = r -> idpCell.textRegion().region().bbox().get().contains(r);
|
||||
BlocksWithTheirWords blocksWithTheirWords = sortBlocksIntoCell(layoutParsingType, words, tables, contains, containsRect);
|
||||
cell.setTextBlocks(blocksWithTheirWords.blocks);
|
||||
wordsInTable.addAll(blocksWithTheirWords.words());
|
||||
}
|
||||
|
||||
TableGridStructureCalculator calculator = new TableGridStructureCalculator(cells, pdfToPageTransform);
|
||||
List<List<Cell>> gridCells = calculator.gridify();
|
||||
TablePageBlock tablePageBlock = new TablePageBlock(null, gridCells);
|
||||
addTableIfValid(words, tablePageBlock, tables, wordsInTable);
|
||||
}
|
||||
return tables;
|
||||
}
|
||||
|
||||
|
||||
private static void addTableIfValid(List<Word> words, TablePageBlock tablePageBlock, List<TablePageBlock> tables, Set<Word> wordsInTable) {
|
||||
|
||||
if (tablePageBlock.getRowCount() > MAX_ROWS_OR_COLS || tablePageBlock.getColCount() == 0 || tablePageBlock.getColCount() > MAX_ROWS_OR_COLS) {
|
||||
return;
|
||||
}
|
||||
words.removeAll(wordsInTable);
|
||||
tables.add(tablePageBlock);
|
||||
}
|
||||
|
||||
|
||||
private BlocksWithTheirWords sortBlocksIntoCell(LayoutParsingType layoutParsingType,
|
||||
List<Word> words,
|
||||
List<TablePageBlock> tables,
|
||||
Function<Point2D, Boolean> contains,
|
||||
Function<Rectangle2D, Boolean> containsRect) {
|
||||
|
||||
List<Word> wordsInCell = new LinkedList<>();
|
||||
for (Word word : words) {
|
||||
Rectangle2D bBoxPdf = word.getBBoxPdf();
|
||||
if (!contains.apply(new Point2D.Double(bBoxPdf.getCenterX(), bBoxPdf.getCenterY()))) {
|
||||
continue;
|
||||
}
|
||||
wordsInCell.add(word);
|
||||
}
|
||||
List<TextPageBlock> textBlocks = blockificationService.blockify(layoutParsingType, wordsInCell, CleanRulings.empty(), null);
|
||||
List<TablePageBlock> tablesInCell = new LinkedList<>();
|
||||
for (TablePageBlock table : tables) {
|
||||
if (containsRect.apply(table.getBBoxPdf())) {
|
||||
tablesInCell.add(table);
|
||||
}
|
||||
}
|
||||
var blocks = readingOrderService.resolve(textBlocks, tablesInCell);
|
||||
return new BlocksWithTheirWords(blocks, wordsInCell);
|
||||
}
|
||||
|
||||
|
||||
private boolean checkIfTableCellsAreUniform(List<Cell> containedCells) {
|
||||
|
||||
if (containedCells.size() <= 2) {
|
||||
return true;
|
||||
}
|
||||
|
||||
Map<Long, List<Long>> cellsGroupedByRoundedWidth = containedCells.stream()
|
||||
.map(BoundingBox::getWidth)
|
||||
.map(size -> Math.round(size / 10.0) * 10)
|
||||
.collect(Collectors.groupingBy(Long::longValue));
|
||||
|
||||
return (double) cellsGroupedByRoundedWidth.size() / containedCells.size() <= TABLE_UNIFORMITY_THRESHOLD;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, PageInformation pageInformation) {
|
||||
|
||||
var solidHorizontalRulingLines = horizontalRulingLines.stream()
|
||||
.filter(r -> !Objects.equals(Ruling.Style.DASHED, r.getStyle()))
|
||||
.toList();
|
||||
var solidVerticalRulingLines = verticalRulingLines.stream()
|
||||
.filter(r -> !Objects.equals(Ruling.Style.DASHED, r.getStyle()))
|
||||
.toList();
|
||||
AffineTransform affineTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToPageCoords(pageInformation);
|
||||
return RectangularIntersectionFinder.find(solidHorizontalRulingLines, solidVerticalRulingLines)
|
||||
.stream()
|
||||
.map(rect -> new Cell(rect, affineTransform))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
private record BlocksWithTheirWords(List<AbstractPageBlock> blocks, Collection<Word> words) {
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,133 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
public class TableFromCellsExtractor {
|
||||
|
||||
@JsonIgnore
|
||||
protected PageBlockType classification;
|
||||
private List<List<Cell>> rows;
|
||||
@Getter
|
||||
@Setter
|
||||
private final List<Cell> originCells;
|
||||
private final AffineTransform pdfToPageTransform;
|
||||
|
||||
|
||||
public TableFromCellsExtractor(List<Cell> originCells, AffineTransform pdfToPageTransform) {
|
||||
|
||||
classification = PageBlockType.TABLE;
|
||||
this.originCells = originCells;
|
||||
this.pdfToPageTransform = pdfToPageTransform;
|
||||
}
|
||||
|
||||
|
||||
public TablePageBlock extract() {
|
||||
|
||||
computeRows(originCells);
|
||||
|
||||
computeHeaders();
|
||||
|
||||
return new TablePageBlock(null, rows);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Detect header cells (either first row or first column):
|
||||
* Column is marked as header if originalCell text is bold and row originalCell text is not bold.
|
||||
* Defaults to row.
|
||||
*/
|
||||
private void computeHeaders() {
|
||||
|
||||
// A bold originalCell is a header originalCell as long as every originalCell to the lefts/top is bold, too
|
||||
// we move from lefts to rights and top to bottom
|
||||
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
|
||||
List<Cell> rowCells = rows.get(rowIndex);
|
||||
if (rowCells.size() == 1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int colIndex = 0; colIndex < rowCells.size(); colIndex++) {
|
||||
Cell cell = rowCells.get(colIndex);
|
||||
List<Cell> cellsToTheLeft = rowCells.subList(0, colIndex);
|
||||
Cell lastHeaderCell = null;
|
||||
for (Cell leftCell : cellsToTheLeft) {
|
||||
if (leftCell.isHeaderCell()) {
|
||||
lastHeaderCell = leftCell;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
List<Cell> cellsToTheTop = new ArrayList<>();
|
||||
for (int i = 0; i < rowIndex; i++) {
|
||||
try {
|
||||
cellsToTheTop.add(rows.get(i).get(colIndex));
|
||||
} catch (IndexOutOfBoundsException e) {
|
||||
log.debug("No cell {} in row {}, ignoring.", colIndex, rowIndex);
|
||||
}
|
||||
}
|
||||
for (Cell topCell : cellsToTheTop) {
|
||||
if (topCell.isHeaderCell()) {
|
||||
lastHeaderCell = topCell;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (lastHeaderCell != null) {
|
||||
cell.getHeaderCells().add(lastHeaderCell);
|
||||
}
|
||||
if (!cell.getTextBlocks().isEmpty() //
|
||||
&& cell.getTextBlocks().get(0) instanceof TextPageBlock textPageBlock //
|
||||
&& textPageBlock.getMostPopularWordStyle().equals("bold")) {
|
||||
cell.setHeaderCell(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
setFirstRowAsHeaderIfNoneFound(rows);
|
||||
}
|
||||
|
||||
|
||||
private void setFirstRowAsHeaderIfNoneFound(List<List<Cell>> rows) {
|
||||
|
||||
if (rows.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (rows.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.noneMatch(Cell::isHeaderCell)) {
|
||||
rows.get(0)
|
||||
.forEach(cell -> cell.setHeaderCell(true));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void computeRows(List<Cell> cells) {
|
||||
|
||||
if (cells.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
TableGridStructureCalculator calculator = new TableGridStructureCalculator(cells, pdfToPageTransform);
|
||||
rows = calculator.gridify();
|
||||
}
|
||||
|
||||
}
|
||||
@ -0,0 +1,353 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
public class TableGridStructureCalculator {

    // multiplied with minimum cell height/width, Cells may be at most this apart in one dimension, and must overlap at least that much in the other dimension to be considered neighbours
    private static final double DISTANCE_FACTOR = 0.5;

    // deduplicated input cells the grid is computed from
    Set<Cell> cells;
    // inverse of the pdfToPageTransform passed to the constructor; used when split cells are rebuilt from page coordinates
    AffineTransform pageToPdfTransform;
    // smallest input cell height/width; scaled by DISTANCE_FACTOR for the neighbour/overlap checks
    double minCellHeight;
    double minCellWidth;


    // @SneakyThrows covers NoninvertibleTransformException from createInverse()
    @SneakyThrows
    TableGridStructureCalculator(Collection<Cell> cells, AffineTransform pdfToPageTransform) {

        this.cells = new HashSet<>(cells);
        this.pageToPdfTransform = pdfToPageTransform.createInverse();
        this.minCellHeight = cells.stream()
                .mapToDouble(cell -> cell.getBBox().getHeight())
                .min().orElse(0);
        this.minCellWidth = cells.stream()
                .mapToDouble(cell -> cell.getBBox().getWidth())
                .min().orElse(0);
    }


    /**
     * Calculates the grid structure of the table. For spanning rows and columns multiple cells with the same values will be inserted.
     * Checks if any cell has more than one neighbor in any direction, if it does, it splits the cell according to its neighbors.
     * This is repeated until no more splits are necessary. Then the rows are computed using that very same linked neighbor structure starting with the top left cell.
     *
     * @return TablePageBlock Structure as a rows of cells matrix
     */
    public List<List<Cell>> gridify() {

        if (cellsHaveLargeOverlaps()) {
            // If cells overlap significantly, the logic below will keep splitting them infinitely, so we revert to the simpler area sweep implementation.
            List<List<Cell>> rows = AreaSweepGridifier.gridify(cells, pageToPdfTransform, minCellWidth, minCellHeight);
            rows = removeEmptyRows(rows);
            rows = removeEmptyCols(rows);
            return rows;
        }

        // wrap every cell so it can carry links to its neighbours
        var linkedCells = cells.stream()
                .map(LinkedCell::new)
                .collect(Collectors.toList());

        computeNeighbours(linkedCells);

        // fix-point iteration: split cells with multiple neighbours in one direction
        // until the neighbour structure is a clean grid
        while (linkedCells.stream()
                .anyMatch(LinkedCell::needsSplit)) {

            List<LinkedCell> newCells = new LinkedList<>();
            for (LinkedCell linkedCell : linkedCells) {
                if (linkedCell.needsSplit()) {
                    newCells.addAll(linkedCell.split());
                } else {
                    newCells.add(linkedCell);
                }
            }
            // neighbour links must be recomputed from scratch after each split round
            computeNeighbours(newCells);
            linkedCells = newCells;
        }
        return buildStructure(linkedCells);
    }


    // true if any two distinct cells overlap by more than DISTANCE_FACTOR of the minimum
    // cell extent in BOTH dimensions (would make the split loop diverge)
    private boolean cellsHaveLargeOverlaps() {

        for (Cell cell1 : cells) {
            for (Cell cell2 : cells) {
                if (cell1.equals(cell2)) {
                    continue;
                }
                if (cell1.horizontalOverlap(cell2) > minCellWidth * DISTANCE_FACTOR //
                        && cell1.verticalOverlap(cell2) > minCellHeight * DISTANCE_FACTOR) {
                    return true;
                }
            }
        }
        return false;
    }


    // assembles the final row matrix from the linked cells and strips empty rows/columns
    private List<List<Cell>> buildStructure(List<LinkedCell> cells) {

        if (cells.isEmpty()) {
            return Collections.emptyList();
        }
        List<List<Cell>> rows = buildRows(cells);
        if (isNotRectangular(rows)) {
            throw new AssertionError();
        }
        rows = removeEmptyRows(rows);
        rows = removeEmptyCols(rows);
        return rows;
    }


    // a grid is rectangular when every row has the same length as the first one
    private boolean isNotRectangular(List<List<Cell>> rows) {

        if (rows.isEmpty()) {
            return true;
        }
        int n = rows.get(0).size();
        return rows.stream()
                .anyMatch(row -> row.size() != n);
    }


    // walks the neighbour links: start at the unique top-left cell, emit one row per
    // "below" step; expects exactly one top-left candidate (assert is active only with -ea)
    private List<List<Cell>> buildRows(List<LinkedCell> cells) {

        List<LinkedCell> topLeftCandidates = cells.stream()
                .filter(LinkedCell::isTopLeft)
                .toList();

        assert topLeftCandidates.size() == 1;
        var cell = topLeftCandidates.get(0);

        List<List<Cell>> rows = new ArrayList<>();
        rows.add(buildRow(cell));
        while (!cell.belows.isEmpty()) {
            cell = cell.belows.get(0);
            rows.add(buildRow(cell));
        }
        if (isNotRectangular(rows)) {
            throw new AssertionError();
        }
        return rows;
    }


    // follows the "rights" links from the row's first cell to collect one row
    private static List<Cell> buildRow(LinkedCell cell) {

        List<Cell> currentRow = new ArrayList<>();
        LinkedCell nextCell = cell;
        currentRow.add(cell.originalCell);
        while (!nextCell.rights.isEmpty()) {
            nextCell = nextCell.rights.get(0);
            currentRow.add(nextCell.originalCell);
        }
        return currentRow;
    }


    // clears and recomputes the neighbour links of every cell
    private void computeNeighbours(List<LinkedCell> cells) {

        for (LinkedCell cell : cells) {
            cell.resetNeighbours();
            computeNeighbours(cell, cells);
        }

    }


    // links cell to each other cell that is close enough in one dimension and overlaps
    // enough in the other; direction is decided by comparing bounding-box centers
    private void computeNeighbours(LinkedCell cell, List<LinkedCell> otherCells) {

        for (LinkedCell otherCell : otherCells) {
            if (cell.equals(otherCell)) {
                continue;
            }
            if (cell.originalCell.horizontalDistance(otherCell.originalCell) <= minCellWidth * DISTANCE_FACTOR
                    && cell.originalCell.verticalOverlap(otherCell.originalCell) >= minCellHeight * DISTANCE_FACTOR) {
                if (cell.originalCell.getBBox().getCenterX() <= otherCell.originalCell.getBBox().getCenterX()) {
                    cell.rights.add(otherCell);
                } else {
                    cell.lefts.add(otherCell);
                }
            } else if (cell.originalCell.verticalDistance(otherCell.originalCell) <= minCellHeight * DISTANCE_FACTOR
                    && cell.originalCell.horizontalOverlap(otherCell.originalCell) >= minCellWidth * DISTANCE_FACTOR) {
                if (cell.originalCell.getBBox().getCenterY() <= otherCell.originalCell.getBBox().getCenterY()) {
                    cell.belows.add(otherCell);
                } else {
                    cell.aboves.add(otherCell);
                }
            }
        }

    }


    // transposes a rectangular matrix; NOTE(review): assumes table is non-empty
    // (table.get(0)) — all current callers guard against empty input
    static <T> List<List<T>> transpose(List<List<T>> table) {

        List<List<T>> ret = new ArrayList<List<T>>();
        final int N = table.get(0).size();
        for (int i = 0; i < N; i++) {
            List<T> col = new ArrayList<T>();
            for (List<T> row : table) {
                col.add(row.get(i));
            }
            ret.add(col);
        }
        return ret;
    }


    // removes columns without any text by transposing, removing empty "rows", transposing back
    private List<List<Cell>> removeEmptyCols(List<List<Cell>> rowsOfCells) {

        if (rowsOfCells.isEmpty()) {
            return rowsOfCells;
        }

        var colsOfCells = transpose(rowsOfCells);
        colsOfCells = removeEmptyRows(colsOfCells);
        return transpose(colsOfCells);
    }


    // keeps only rows in which at least one cell has text blocks
    private List<List<Cell>> removeEmptyRows(List<List<Cell>> rowsOfCells) {

        return rowsOfCells.stream()
                .filter(row -> row.stream()
                        .anyMatch(cell -> !cell.getTextBlocks().isEmpty()))
                .collect(Collectors.toList());
    }


    /**
     * A cell decorated with links to its direct neighbours in all four directions.
     * Inner (non-static) class: split cells need the enclosing calculator's pageToPdfTransform.
     */
    class LinkedCell {

        private final Cell originalCell;
        private final List<LinkedCell> rights;
        private final List<LinkedCell> lefts;
        private final List<LinkedCell> aboves;
        private final List<LinkedCell> belows;


        LinkedCell(Cell cell) {

            this.originalCell = cell;
            this.rights = new LinkedList<>();
            this.lefts = new LinkedList<>();
            this.aboves = new LinkedList<>();
            this.belows = new LinkedList<>();
        }


        // a cell must be split when it has more than one neighbour in any single direction
        public boolean needsSplit() {

            return rights.size() > 1 || lefts.size() > 1 || aboves.size() > 1 || belows.size() > 1;
        }


        // the grid origin: no neighbours to the left or above
        public boolean isTopLeft() {

            return lefts.isEmpty() && aboves.isEmpty();
        }


        public String toString() {

            return originalCell.toString();
        }


        /**
         * Splits this cell along the direction with the most neighbours so that each fragment
         * faces exactly one neighbour; returns the fragments (or this cell if no split applies).
         */
        public Collection<LinkedCell> split() {

            if (rights.size() > 1 && rights.size() >= lefts.size()) {
                return splitY(rights);
            }
            if (lefts.size() > 1) {
                return splitY(lefts);
            }
            if (aboves.size() > 1 && aboves.size() >= belows.size()) {
                return splitX(aboves);
            }
            if (belows.size() > 1) {
                return splitX(belows);
            }
            return List.of(this);
        }


        // splits vertically: one fragment per neighbour, cut at the neighbours' maxY values
        private List<LinkedCell> splitY(List<LinkedCell> neighbours) {

            List<LinkedCell> splitCells = new LinkedList<>();
            List<Double> ySplit = neighbours.stream()
                    .map(right -> right.originalCell.getMaxY())
                    .sorted()
                    .toList();
            Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
            double maxX = originalCell.getBBox().getMaxX();
            double x = originalCell.getBBox().getX();
            double maxY = originalCell.getBBox().getMaxY();
            for (Double neighborY : ySplit) {
                // clamp each cut to this cell's own extent
                double y = Math.min(neighborY, maxY);
                Point2D bottomRight = new Point2D.Double(maxX, y);
                Cell cell = copyCell(topLeft, bottomRight);
                splitCells.add(new LinkedCell(cell));
                topLeft = new Point2D.Double(x, y);
            }
            return splitCells;
        }


        // splits horizontally: one fragment per neighbour, cut at the neighbours' maxX values
        private List<LinkedCell> splitX(List<LinkedCell> neighbours) {

            List<LinkedCell> splitCells = new LinkedList<>();
            List<Double> xSplit = neighbours.stream()
                    .map(right -> right.originalCell.getMaxX())
                    .sorted()
                    .toList();
            Point2D topLeft = new Point2D.Double(originalCell.getBBox().getMinX(), originalCell.getBBox().getMinY());
            double maxY = originalCell.getBBox().getMaxY();
            double y = originalCell.getBBox().getY();
            double maxX = originalCell.getBBox().getMaxX();
            for (Double neighborX : xSplit) {
                // clamp each cut to this cell's own extent
                double x = Math.min(neighborX, maxX);
                Point2D bottomRight = new Point2D.Double(x, maxY);
                Cell cell = copyCell(topLeft, bottomRight);
                splitCells.add(new LinkedCell(cell));
                topLeft = new Point2D.Double(x, y);
            }
            return splitCells;
        }


        // builds a fragment cell with the given corners, inheriting header flag and text blocks
        private Cell copyCell(Point2D topLeft, Point2D bottomRight) {

            Cell cell = Cell.fromPageCoordinates(topLeft, bottomRight, pageToPdfTransform);
            cell.setHeaderCell(originalCell.isHeaderCell());
            cell.setTextBlocks(originalCell.getTextBlocks());
            return cell;
        }


        // drops all neighbour links so they can be recomputed after a split round
        public void resetNeighbours() {

            rights.clear();
            lefts.clear();
            aboves.clear();
            belows.clear();
        }

    }

}
|
||||
@ -1,4 +1,4 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
@ -0,0 +1,113 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.util.Objects;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.Figure;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.KeyValuePair;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.Region;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.Table;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.TableCell;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.TableCellType;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.TextRegion;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.IdpLayerConfig;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.FilledRectangle;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||
|
||||
import lombok.AccessLevel;
|
||||
import lombok.experimental.FieldDefaults;
|
||||
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE, makeFinal = true)
|
||||
public class IdpResultLayer extends IdpLayerConfig {
|
||||
|
||||
public static final int LINE_WIDTH = 1;
|
||||
|
||||
|
||||
public IdpResultLayer(IdpResult result) {
|
||||
|
||||
result.tables()
|
||||
.forEach(this::addTable);
|
||||
result.keyValuePairs()
|
||||
.forEach(this::addKeyValue);
|
||||
result.figures()
|
||||
.forEach(this::addFigure);
|
||||
}
|
||||
|
||||
|
||||
private void addFigure(Figure figure) {
|
||||
|
||||
addRegion(figure.image(), figures, IMAGE_COLOR);
|
||||
if (figure.caption() != null) {
|
||||
addRegion(figure.caption().region(), figures, IMAGE_COLOR);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addTable(Table table) {
|
||||
|
||||
for (Region bbox : table.bboxes()) {
|
||||
addRegion(bbox, tables, TABLE_COLOR);
|
||||
}
|
||||
for (TableCell cell : table.cells()) {
|
||||
addRegion(cell.textRegion().region(), tables, INNER_LINES_COLOR);
|
||||
if (Objects.equals(cell.kind(), TableCellType.ROW_HEADER) || Objects.equals(cell.kind(), TableCellType.COLUMN_HEADER)) {
|
||||
addRegionAsFilledRect(cell.textRegion().region(), tables, HEADER_CELL_COLOR);
|
||||
}
|
||||
}
|
||||
if (table.caption() != null) {
|
||||
addRegion(table.caption().region(), tables, TABLE_COLOR);
|
||||
}
|
||||
for (TextRegion footnote : table.footnotes()) {
|
||||
addRegion(footnote.region(), tables, FOOTNOTE_COLOR);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private void addQuadPoint(int pageNumber, QuadPoint bbox, Visualizations vis, Color color) {
|
||||
|
||||
var visOnPage = getOrCreateVisualizationsOnPage(pageNumber, vis);
|
||||
bbox.asLines()
|
||||
.forEach(line -> visOnPage.getColoredLines().add(new ColoredLine(line, color, LINE_WIDTH)));
|
||||
}
|
||||
|
||||
|
||||
private void addRegion(Region region, Visualizations vis, Color color) {
|
||||
|
||||
var sectionsOnPage = getOrCreateVisualizationsOnPage(region.pageNumber(), vis);
|
||||
region.bbox().get().asLines()
|
||||
.forEach(line -> sectionsOnPage.getColoredLines().add(new ColoredLine(line, color, LINE_WIDTH)));
|
||||
}
|
||||
|
||||
|
||||
private void addRegionAsFilledRect(Region region, Visualizations vis, Color color) {
|
||||
|
||||
var sectionsOnPage = getOrCreateVisualizationsOnPage(region.pageNumber(), vis);
|
||||
sectionsOnPage.getFilledRectangles().add(new FilledRectangle(region.bbox().get().getBounds2D(), color, 0.2f));
|
||||
}
|
||||
|
||||
|
||||
public void addKeyValue(KeyValuePair keyValue) {
|
||||
|
||||
if (keyValue.key() != null) {
|
||||
addRegion(keyValue.key().region(), keyValuePairs, KEY_COLOR);
|
||||
}
|
||||
if (keyValue.value() != null) {
|
||||
addRegion(keyValue.value().region(), keyValuePairs, VALUE_COLOR);
|
||||
}
|
||||
if (keyValue.key() != null && keyValue.value() != null) {
|
||||
QuadPoint key = keyValue.key().region().bbox().get();
|
||||
QuadPoint value = keyValue.value().region().bbox().get();
|
||||
|
||||
var line = LineUtils.findClosestMidpointLine(key, value);
|
||||
var arrowHead = LineUtils.createArrowHead(line, Math.min(LineUtils.length(line), 5));
|
||||
var linesOnPage = getOrCreateVisualizationsOnPage(keyValue.key().region().pageNumber(), keyValuePairs).getColoredLines();
|
||||
linesOnPage.add(new ColoredLine(line, KEY_VALUE_BBOX_COLOR, LINE_WIDTH));
|
||||
linesOnPage.add(new ColoredLine(arrowHead[0], KEY_VALUE_BBOX_COLOR, LINE_WIDTH));
|
||||
linesOnPage.add(new ColoredLine(arrowHead[1], KEY_VALUE_BBOX_COLOR, LINE_WIDTH));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import org.springframework.stereotype.Service;
|
||||
@ -14,6 +15,7 @@ import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsi
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVisualization;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.OutlineMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutGrid;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.LayerGroup;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.Outline;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||
|
||||
@ -48,16 +50,15 @@ public class LayoutGridService {
|
||||
document.layoutDebugLayer().addSentenceVisualization(document.document().getTextBlock());
|
||||
document.layoutDebugLayer().addOutlineHeadlines(document.document());
|
||||
|
||||
List<LayerGroup> layers = new LinkedList<>();
|
||||
layers.add(layoutGrid);
|
||||
if (document.layoutDebugLayer().isActive()) {
|
||||
viewerDocumentService.addLayerGroups(originFile,
|
||||
destinationFile,
|
||||
List.of(layoutGrid, document.layoutDebugLayer()),
|
||||
layoutParserVersion,
|
||||
layoutParsingTypeName,
|
||||
outline);
|
||||
} else {
|
||||
viewerDocumentService.addLayerGroups(originFile, destinationFile, List.of(layoutGrid), layoutParserVersion, layoutParsingTypeName, outline);
|
||||
layers.add(document.layoutDebugLayer());
|
||||
|
||||
}
|
||||
|
||||
viewerDocumentService.addLayerGroups(originFile, destinationFile, layers, layoutParserVersion, layoutParsingTypeName, outline);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,125 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.visualization;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Line2D;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.util.List;
|
||||
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.QuadPoint;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class LineUtils {
|
||||
|
||||
public List<ColoredLine> quadPointAsLines(QuadPoint rect, boolean tight) {
|
||||
|
||||
if (tight) {
|
||||
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.GREEN, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.GREEN, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.GREEN, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.GREEN, 1));
|
||||
}
|
||||
|
||||
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), Color.BLUE, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), Color.BLUE, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), Color.BLUE, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), Color.BLUE, 1));
|
||||
}
|
||||
|
||||
|
||||
public List<ColoredLine> quadPointAsLines(QuadPoint rect, Color color) {
|
||||
|
||||
return List.of(new ColoredLine(new Line2D.Double(rect.a(), rect.b()), color, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.b(), rect.c()), color, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.c(), rect.d()), color, 1),
|
||||
new ColoredLine(new Line2D.Double(rect.d(), rect.a()), color, 1));
|
||||
}
|
||||
|
||||
|
||||
public static Line2D transform(Line2D line2D, AffineTransform affineTransform) {
|
||||
|
||||
var p1 = affineTransform.transform(line2D.getP1(), null);
|
||||
var p2 = affineTransform.transform(line2D.getP2(), null);
|
||||
return new Line2D.Double(p1, p2);
|
||||
}
|
||||
|
||||
|
||||
public static double length(Line2D line2D) {
|
||||
|
||||
return line2D.getP1().distance(line2D.getP2());
|
||||
}
|
||||
|
||||
|
||||
public static Line2D findClosestMidpointLine(QuadPoint quad1, QuadPoint quad2) {
|
||||
|
||||
List<Line2D> lines1 = quad1.asLines()
|
||||
.toList();
|
||||
List<Line2D> lines2 = quad2.asLines()
|
||||
.toList();
|
||||
|
||||
Line2D closestLine1 = null;
|
||||
Line2D closestLine2 = null;
|
||||
double minDistance = Double.MAX_VALUE;
|
||||
|
||||
for (Line2D line1 : lines1) {
|
||||
for (Line2D line2 : lines2) {
|
||||
double distance = lineDistance(line1, line2);
|
||||
if (distance < minDistance) {
|
||||
minDistance = distance;
|
||||
closestLine1 = line1;
|
||||
closestLine2 = line2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (closestLine1 == null || closestLine2 == null) {
|
||||
throw new IllegalStateException("Could not find closest lines");
|
||||
}
|
||||
|
||||
Point2D midpoint1 = getMidpoint(closestLine1);
|
||||
Point2D midpoint2 = getMidpoint(closestLine2);
|
||||
|
||||
return new Line2D.Double(midpoint1, midpoint2);
|
||||
}
|
||||
|
||||
|
||||
private static double lineDistance(Line2D line1, Line2D line2) {
|
||||
|
||||
return Math.abs(getMidpoint(line1).distance(getMidpoint(line2)));
|
||||
}
|
||||
|
||||
|
||||
private static Point2D getMidpoint(Line2D line) {
|
||||
|
||||
double x = (line.getX1() + line.getX2()) / 2;
|
||||
double y = (line.getY1() + line.getY2()) / 2;
|
||||
return new Point2D.Double(x, y);
|
||||
}
|
||||
|
||||
|
||||
public static Line2D[] createArrowHead(Line2D line, double arrowLength) {
|
||||
|
||||
Point2D start = line.getP1();
|
||||
Point2D end = line.getP2();
|
||||
|
||||
// Calculate the angle of the line
|
||||
double angle = Math.atan2(end.getY() - start.getY(), end.getX() - start.getX());
|
||||
|
||||
// Calculate the points for the two arrow lines
|
||||
double arrowHeadAngle = Math.PI / 6;
|
||||
double x1 = end.getX() - arrowLength * Math.cos(angle - arrowHeadAngle);
|
||||
double y1 = end.getY() - arrowLength * Math.sin(angle - arrowHeadAngle);
|
||||
double x2 = end.getX() - arrowLength * Math.cos(angle + arrowHeadAngle);
|
||||
double y2 = end.getY() - arrowLength * Math.sin(angle + arrowHeadAngle);
|
||||
|
||||
// Create and return the two arrow lines
|
||||
Line2D arrow1 = new Line2D.Double(end, new Point2D.Double(x1, y1));
|
||||
Line2D arrow2 = new Line2D.Double(end, new Point2D.Double(x2, y2));
|
||||
|
||||
return new Line2D[]{arrow1, arrow2};
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,34 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
|
||||
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class BBoxMergingUtility {
|
||||
|
||||
public Map<Page, Rectangle2D> mergeBBoxes(List<Map<Page, Rectangle2D>> bboxesToMerge) {
|
||||
|
||||
Map<Page, Rectangle2D> bBoxPerPage = new HashMap<>();
|
||||
Set<Page> pages = bboxesToMerge.stream()
|
||||
.flatMap(map -> map.keySet()
|
||||
.stream())
|
||||
.collect(Collectors.toSet());
|
||||
for (Page page : pages) {
|
||||
Rectangle2D bBoxOnPage = bboxesToMerge.stream()
|
||||
.filter(childBboxPerPage -> childBboxPerPage.containsKey(page))
|
||||
.map(childBboxPerPage -> childBboxPerPage.get(page))
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
bBoxPerPage.put(page, bBoxOnPage);
|
||||
}
|
||||
return bBoxPerPage;
|
||||
}
|
||||
|
||||
}
|
||||
@ -2,6 +2,8 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
|
||||
@ -4,12 +4,14 @@ import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Comparator;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||
|
||||
public class GeometricComparators {
|
||||
|
||||
private static final int COMPARATOR_ROUNDING = 2;
|
||||
static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
||||
|
||||
public static final Comparator<Point2D> X_FIRST_POINT_COMPARATOR = (point1, point2) -> {
|
||||
|
||||
@ -58,6 +60,17 @@ public class GeometricComparators {
|
||||
return cell1Size.compareTo(cell2Size);
|
||||
};
|
||||
|
||||
public static final Comparator<BoundingBox> CELL_SORTER = (o1, o2) -> {
|
||||
|
||||
if (o1.equals(o2)) {
|
||||
return 0;
|
||||
}
|
||||
if (o1.verticalOverlapPdf(o2) > VERTICAL_COMPARISON_THRESHOLD * ((o1.getHeight() + o2.getHeight()) / 2)) {
|
||||
return Double.compare(o1.getMinX(), o2.getMinX());
|
||||
} else {
|
||||
return Double.compare(o1.getMaxY(), o2.getMaxY());
|
||||
}
|
||||
};
|
||||
public static final Comparator<Rectangle2D> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
|
||||
|
||||
Double rect1Size = rect1.getHeight() * rect1.getWidth();
|
||||
|
||||
@ -1,59 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Page;
|
||||
|
||||
public record PageInformation(Rectangle2D mediabox, int number, int rotationDegrees) {
|
||||
|
||||
public static PageInformation fromPDPage(int pageNum, PDPage page) {
|
||||
|
||||
PDRectangle mediaBox = page.getMediaBox();
|
||||
return new PageInformation(new Rectangle2D.Double(mediaBox.getLowerLeftX(), mediaBox.getLowerLeftY(), mediaBox.getWidth(), mediaBox.getHeight()),
|
||||
pageNum,
|
||||
page.getRotation());
|
||||
}
|
||||
|
||||
|
||||
public static PageInformation fromPage(Page page) {
|
||||
|
||||
return new PageInformation(new Rectangle2D.Double(0, 0, page.getWidth(), page.getHeight()), page.getNumber(), page.getRotation());
|
||||
}
|
||||
|
||||
|
||||
public double height() {
|
||||
|
||||
return mediabox.getHeight();
|
||||
}
|
||||
|
||||
|
||||
public double heightRot() {
|
||||
|
||||
if (rotationDegrees == 90 || rotationDegrees == 270) {
|
||||
return width();
|
||||
}
|
||||
return height();
|
||||
}
|
||||
|
||||
|
||||
public double width() {
|
||||
|
||||
return mediabox.getWidth();
|
||||
}
|
||||
|
||||
|
||||
public double minX() {
|
||||
|
||||
return mediabox.getX();
|
||||
}
|
||||
|
||||
|
||||
public double minY() {
|
||||
|
||||
return mediabox.getY();
|
||||
}
|
||||
|
||||
}
|
||||
@ -1,42 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
import com.google.protobuf.Message;
|
||||
import com.google.protobuf.MessageOrBuilder;
|
||||
import com.google.protobuf.Struct;
|
||||
import com.google.protobuf.util.JsonFormat;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
public class ProtobufUtil {
|
||||
|
||||
public static String toJson(MessageOrBuilder messageOrBuilder) throws IOException {
|
||||
return JsonFormat.printer().print(messageOrBuilder);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public static Message fromJson(String json) throws IOException {
|
||||
Message.Builder structBuilder = Struct.newBuilder();
|
||||
JsonFormat.parser().ignoringUnknownFields().merge(json, structBuilder);
|
||||
return structBuilder.build();
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public <T extends Message> File serializeToTempFile(T any) {
|
||||
var tempFile = File.createTempFile("storage-protobuf", ".data");
|
||||
|
||||
try (var fos = new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(tempFile)))) {
|
||||
any.writeTo(fos);
|
||||
return tempFile;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@ -2,10 +2,12 @@ package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import static java.lang.String.format;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Point2D;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.awt.geom.RectangularShape;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
@ -125,7 +127,7 @@ public class RectangleTransformations {
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D rectangle2DBBox(List<Rectangle2D> rectangle2DList) {
|
||||
public static Rectangle2D rectangle2DBBox(Collection<Rectangle2D> rectangle2DList) {
|
||||
|
||||
return rectangle2DList.stream()
|
||||
.collect(new Rectangle2DBBoxCollector());
|
||||
@ -185,6 +187,12 @@ public class RectangleTransformations {
|
||||
}
|
||||
|
||||
|
||||
public static Rectangle2D transform(Rectangle2D rect, AffineTransform transform) {
|
||||
|
||||
return transform.createTransformedShape(rect).getBounds2D();
|
||||
}
|
||||
|
||||
|
||||
private static class Rectangle2DBBoxCollector implements Collector<Rectangle2D, Rectangle2DBBoxCollector.BBox, Rectangle2D> {
|
||||
|
||||
@Override
|
||||
|
||||
@ -13,7 +13,7 @@ import java.util.stream.Collectors;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.AngleFilter;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.TextBoundingBox;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.UnionFind;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||
@ -46,6 +46,12 @@ public class TextPositionOperations {
|
||||
return sortUsingLineDetection(sequences);
|
||||
}
|
||||
|
||||
public List<Word> mergeAndSort(TextPageBlock textBlocks) {
|
||||
|
||||
var sequences = new HashSet<>(textBlocks.getWords());
|
||||
return sortUsingLineDetection(sequences);
|
||||
}
|
||||
|
||||
|
||||
public List<Word> sort(List<Word> sequences) {
|
||||
|
||||
|
||||
@ -1,44 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
// simple implementation of a disjoint-set data structure
|
||||
// https://en.wikipedia.org/wiki/Disjoint-set_data_structure
|
||||
public class UnionFind<T> {
|
||||
|
||||
Map<T, T> parents = new HashMap<>();
|
||||
Map<T, Integer> numberOfObjects = new HashMap<>();
|
||||
|
||||
|
||||
public T find(T node) {
|
||||
|
||||
if (!parents.containsKey(node)) {
|
||||
parents.put(node, node);
|
||||
numberOfObjects.put(node, 1);
|
||||
}
|
||||
if (!node.equals(parents.get(node))) {
|
||||
parents.put(node, find(parents.get(node)));
|
||||
}
|
||||
return parents.get(node);
|
||||
}
|
||||
|
||||
|
||||
public void union(T node1, T node2) {
|
||||
|
||||
T root1 = find(node1);
|
||||
T root2 = find(node2);
|
||||
|
||||
if (!root1.equals(root2)) {
|
||||
if (numberOfObjects.getOrDefault(root1, 1) < numberOfObjects.getOrDefault(root2, 1)) {
|
||||
parents.put(root1, root2);
|
||||
numberOfObjects.put(root2, numberOfObjects.get(root2) + numberOfObjects.get(root1));
|
||||
} else {
|
||||
parents.put(root2, root1);
|
||||
numberOfObjects.put(root1, numberOfObjects.get(root1) + numberOfObjects.get(root2));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -17,7 +17,6 @@ import java.util.concurrent.atomic.AtomicInteger;
|
||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||
|
||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||
import com.iqser.red.service.redaction.v1.server.data.LayoutEngineProto;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.TextRange;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Document;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.LayoutEngine;
|
||||
@ -36,7 +35,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.NumberWord;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.LayoutDebugLayerConfig;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
@ -59,7 +58,7 @@ import lombok.experimental.FieldDefaults;
|
||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||
public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
|
||||
boolean active;
|
||||
boolean active = true;
|
||||
|
||||
Map<Integer, AtomicInteger> outlineObjectsWithoutPointsPerPage = new HashMap<>();
|
||||
|
||||
@ -141,7 +140,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
}
|
||||
|
||||
|
||||
public void addCellVisualizations(List<? extends BoundingBox> cells, int pageNumber) {
|
||||
public void addCellVisualizations(Collection<? extends BoundingBox> cells, int pageNumber, Color color) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
@ -149,7 +148,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells);
|
||||
visualizationsOnPage.getColoredRectangles()
|
||||
.addAll(cells.stream()
|
||||
.map(cell -> new ColoredRectangle(cell.getBBoxPdf(), CELLS_COLOR, 1))
|
||||
.map(cell -> new ColoredRectangle(cell.getBBoxPdf(), color == null ? CELLS_COLOR : color, 1))
|
||||
.toList());
|
||||
}
|
||||
|
||||
@ -211,7 +210,7 @@ public class LayoutDebugLayer extends LayoutDebugLayerConfig {
|
||||
}
|
||||
|
||||
|
||||
public void addTextBlockVisualizations(List<AbstractPageBlock> textPageBlocks, int page) {
|
||||
public void addTextBlockVisualizations(List<? extends AbstractPageBlock> textPageBlocks, int page) {
|
||||
|
||||
if (!active) {
|
||||
return;
|
||||
|
||||
@ -26,9 +26,8 @@ import com.iqser.red.service.redaction.v1.server.model.document.nodes.SemanticNo
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.SuperSection;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.Table;
|
||||
import com.iqser.red.service.redaction.v1.server.model.document.nodes.TableCell;
|
||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.viewerdoc.layers.LayoutGridLayerConfig;
|
||||
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||
@ -93,19 +92,10 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
||||
public void addTreeId(SemanticNode semanticNode) {
|
||||
|
||||
Page page = semanticNode.getFirstPage();
|
||||
if (semanticNode.getBBox()
|
||||
.get(page) == null) {
|
||||
if (semanticNode.getBBox().get(page) == null) {
|
||||
return;
|
||||
}
|
||||
addPlacedText(page,
|
||||
semanticNode.getBBox()
|
||||
.get(page),
|
||||
semanticNode.getBBox()
|
||||
.get(page),
|
||||
buildTreeIdString(semanticNode),
|
||||
1,
|
||||
treeIds,
|
||||
TREEID_COLOR);
|
||||
addPlacedText(page, semanticNode.getBBox().get(page), semanticNode.getBBox().get(page), buildTreeIdString(semanticNode), 1, treeIds, TREEID_COLOR);
|
||||
}
|
||||
|
||||
|
||||
@ -134,8 +124,7 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
||||
.toList();
|
||||
Integer maxChildDepth = subSections.stream()
|
||||
.map(node -> node.getTreeId().size())
|
||||
.max(Integer::compareTo)
|
||||
.orElse(section.getTreeId().size());
|
||||
.max(Integer::compareTo).orElse(section.getTreeId().size());
|
||||
int ownDepth = section.getTreeId().size();
|
||||
|
||||
Page firstPage = section.getFirstPage();
|
||||
@ -321,8 +310,7 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
||||
Visualizations visualizations = semanticNode.getType().equals(NodeType.TABLE_OF_CONTENTS) ? toc : sections;
|
||||
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), visualizations).getColoredLines();
|
||||
int lineWidthModifier = maxChildDepth - ownDepth;
|
||||
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox()
|
||||
.get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier));
|
||||
Rectangle2D r = RectangleTransformations.pad(semanticNode.getBBox().get(page), LINE_WIDTH * (1 + lineWidthModifier), LINE_WIDTH * (1 + lineWidthModifier));
|
||||
|
||||
SemanticNode highestParent = semanticNode.getHighestParent();
|
||||
Rectangle2D highestParentRect = rectangleMap.get(new RectangleIdentifier(highestParent.getTreeId(), page.getNumber()));
|
||||
@ -371,8 +359,7 @@ public class LayoutGrid extends LayoutGridLayerConfig {
|
||||
List<Double> ys = yStream.collect(Collectors.toList());
|
||||
ys.remove(0);
|
||||
|
||||
Rectangle2D tableBBox = table.getBBox()
|
||||
.get(page);
|
||||
Rectangle2D tableBBox = table.getBBox().get(page);
|
||||
List<ColoredLine> coloredLines = getOrCreateVisualizationsOnPage(page.getNumber(), tables).getColoredLines();
|
||||
|
||||
xs.forEach(x -> {
|
||||
|
||||
@ -0,0 +1,60 @@
|
||||
package com.knecon.fforesight.service.layoutparser.processor.services.tables;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.awt.geom.AffineTransform;
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||
|
||||
class TableAreaFillerTest {
|
||||
|
||||
@Test
|
||||
void findMissingCells() {
|
||||
|
||||
Rectangle2D area = new Rectangle2D.Double(0, 0, 2, 2);
|
||||
List<Rectangle2D> rectangles = List.of(new Rectangle2D.Double(0, 0, 1, 1), new Rectangle2D.Double(1, 1, 1, 1), new Rectangle2D.Double(1, 0, 1, 1));
|
||||
Set<Rectangle2D> missing = TableAreaFiller.findMissingRects(rectangles, area);
|
||||
|
||||
assertEquals(1, missing.size());
|
||||
assertEquals(new Rectangle2D.Double(0, 1, 1, 1), missing.iterator().next());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
void findMissingCells2() {
|
||||
|
||||
Rectangle2D area = new Rectangle2D.Double(0, 0, 3, 3);
|
||||
List<Rectangle2D> rectangles = List.of(new Rectangle2D.Double(0, 0, 1, 1),
|
||||
new Rectangle2D.Double(1, 0, 1, 1),
|
||||
new Rectangle2D.Double(2, 0, 1, 1),
|
||||
new Rectangle2D.Double(0, 1, 1, 1),
|
||||
new Rectangle2D.Double(1, 1, 1, 1),
|
||||
new Rectangle2D.Double(2, 1, 1, 1));
|
||||
|
||||
|
||||
var missing = TableAreaFiller.findMissingRects(rectangles, area);
|
||||
assertEquals(1, missing.size());
|
||||
assertEquals(new Rectangle2D.Double(0, 2, 3, 1), missing.iterator().next());
|
||||
}
|
||||
|
||||
@Test
|
||||
void findMissingCells3() {
|
||||
|
||||
Rectangle2D area = new Rectangle2D.Double(0, 0, 2, 2);
|
||||
List<Rectangle2D> rectangles = List.of(new Rectangle2D.Double(0, 0, 1, 1));
|
||||
Set<Rectangle2D> missing = TableAreaFiller.findMissingRects(rectangles, area);
|
||||
|
||||
assertEquals(2, missing.size());
|
||||
Iterator<Rectangle2D> iterator = missing.iterator();
|
||||
assertEquals(new Rectangle2D.Double(0, 1, 2, 1), iterator.next());
|
||||
assertEquals(new Rectangle2D.Double(1, 0, 1, 1), iterator.next());
|
||||
}
|
||||
|
||||
}
|
||||
@ -75,6 +75,7 @@ public abstract class AbstractTest {
|
||||
protected final static String TENANT_ID = "tenant";
|
||||
protected final static String VIEWER_DOCUMENT_ID = "viewer";
|
||||
protected final static String SIMPLIFIED_ID = "simplified";
|
||||
protected final static String IDP_ID = "idp";
|
||||
|
||||
|
||||
protected LayoutParsingRequest buildStandardLayoutParsingRequest() {
|
||||
@ -117,7 +118,14 @@ public abstract class AbstractTest {
|
||||
|
||||
public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
|
||||
|
||||
return buildDefaultLayoutParsingRequest(fileName, layoutParsingType, debug, false);
|
||||
}
|
||||
|
||||
|
||||
public static LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug, boolean withIdpResult) {
|
||||
|
||||
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
|
||||
Optional<String> idpResultStorageId = withIdpResult ? Optional.of(fileName + IDP_ID) : Optional.empty();
|
||||
return LayoutParsingRequest.builder()
|
||||
.identifier(identifier)
|
||||
.layoutParsingType(layoutParsingType)
|
||||
@ -132,6 +140,7 @@ public abstract class AbstractTest {
|
||||
.simplifiedTextStorageId(fileName + SIMPLIFIED_ID)
|
||||
.viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID)
|
||||
.documentMarkdownFileStorageId(Optional.of(fileName + MARKDOWN_FILE_ID))
|
||||
.idpResultStorageId(idpResultStorageId)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
@ -34,6 +34,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.factory.Doc
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -51,11 +52,12 @@ public class BdrJsonBuildTest extends AbstractTest {
|
||||
|
||||
return DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.CLARIFYND,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.CLARIFYND,
|
||||
file,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file",file.toString()))).document();
|
||||
file,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
IdpResult.empty(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file",file.toString()))).document();
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -16,6 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.DocumentWithVi
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -39,6 +40,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
||||
fileResource,
|
||||
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
|
||||
new TableServiceResponse(),
|
||||
IdpResult.empty(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", filename, "debug", "true"));
|
||||
}
|
||||
@ -63,6 +65,7 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
||||
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
|
||||
.get()),
|
||||
new TableServiceResponse(),
|
||||
IdpResult.empty(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
layoutParsingRequest.identifier()));
|
||||
} else {
|
||||
|
||||
@ -30,6 +30,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
import com.knecon.fforesight.tenantcommons.TenantsClient;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -106,6 +107,7 @@ public class HeadlinesGoldStandardIntegrationTest {
|
||||
pdfFileResource.getFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
IdpResult.empty(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", filePath))).document();
|
||||
var foundHeadlines = documentGraph.streamAllSubNodes()
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
@ -24,6 +25,7 @@ import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@Slf4j
|
||||
@Disabled
|
||||
public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
|
||||
public static final LayoutParsingType LAYOUT_PARSING_TYPE = LayoutParsingType.DOCUMINE_OLD;
|
||||
@ -33,15 +35,24 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
public void testLayoutParserEndToEnd() {
|
||||
|
||||
String filePath = "/home/kschuettler/Dokumente/LayoutparsingEvaluation/VV-340050.pdf";
|
||||
String filePath = "/home/kschuettler/Downloads/2021-2048323.pdf";
|
||||
|
||||
runForFile(filePath);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testLayoutParserEndToEndWithIdpResult() {
|
||||
|
||||
String filePath = "/tmp/OCR_TEST/2009-1048395_50pages_tables.pdf/document.pdf";
|
||||
String idpResultPath = "/tmp/OCR_TEST/2009-1048395_50pages_tables.pdf/idpResult.json";
|
||||
|
||||
runForFile(filePath, idpResultPath);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
@ -62,9 +73,15 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void runForFile(String filePath) {
|
||||
|
||||
runForFile(filePath, null);
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
private void runForFile(String filePath, String idpResultPath) {
|
||||
|
||||
String fileName = Path.of(filePath).getFileName().toString();
|
||||
File file;
|
||||
if (filePath.startsWith("files")) { // from resources
|
||||
@ -73,7 +90,13 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
||||
file = new File(filePath);
|
||||
}
|
||||
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LAYOUT_PARSING_TYPE, true);
|
||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LAYOUT_PARSING_TYPE, true, true);
|
||||
|
||||
if (layoutParsingRequest.idpResultStorageId().isPresent() && idpResultPath != null) {
|
||||
try (var in = new FileInputStream(idpResultPath)) {
|
||||
storageService.storeObject(TENANT_ID, layoutParsingRequest.idpResultStorageId().get(), in);
|
||||
}
|
||||
}
|
||||
|
||||
prepareStorage(layoutParsingRequest, file);
|
||||
|
||||
|
||||
@ -32,6 +32,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
@ -192,6 +193,7 @@ public class OutlineDetectionTest extends AbstractTest {
|
||||
fileResource,
|
||||
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
|
||||
new TableServiceResponse(),
|
||||
IdpResult.empty(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", filename, "debug", "true"));
|
||||
}
|
||||
@ -209,6 +211,7 @@ public class OutlineDetectionTest extends AbstractTest {
|
||||
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
|
||||
.get()),
|
||||
new TableServiceResponse(),
|
||||
IdpResult.empty(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
layoutParsingRequest.identifier()));
|
||||
} else {
|
||||
|
||||
@ -20,6 +20,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@ -61,6 +62,7 @@ public class SimplifiedTextServiceTest extends AbstractTest {
|
||||
file,
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
IdpResult.empty(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", file.toString()))).document();
|
||||
}
|
||||
|
||||
@ -21,6 +21,7 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.server.BuildDocumentTest;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -58,11 +59,12 @@ public class DocumentGraphJsonWritingTest extends BuildDocumentTest {
|
||||
|
||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file",filename.toFile().toString()))).document();
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
IdpResult.empty(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file",filename.toFile().toString()))).document();
|
||||
|
||||
DocumentData documentData = DocumentDataMapper.toDocumentData(documentGraph);
|
||||
ObjectMapper mapper = ObjectMapperFactory.create();
|
||||
|
||||
@ -29,7 +29,7 @@ public class DocumentGraphMappingTest extends BuildDocumentTest {
|
||||
@SneakyThrows
|
||||
public void testGraphMapping() {
|
||||
|
||||
String filename = "files/syngenta/CustomerFiles/SYNGENTA_EFSA_sanitisation_GFL_v1_moreSections.pdf";
|
||||
String filename = "files/syngenta/CustomerFiles/Fludioxonil_duplicates.pdf";
|
||||
|
||||
Document document = buildGraph(filename);
|
||||
DocumentData documentData = DocumentDataMapper.toDocumentData(document);
|
||||
|
||||
@ -17,8 +17,9 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
|
||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||
import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer;
|
||||
import com.knecon.fforesight.service.layoutparser.server.BuildDocumentTest;
|
||||
import com.knecon.fforesight.service.layoutparser.server.PDFNetInitializer;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
@ -74,6 +75,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
||||
documentFile,
|
||||
new ImageServiceResponse(),
|
||||
tableResponse,
|
||||
IdpResult.empty(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
|
||||
PDFTronViewerDocumentService viewerDocumentService = new PDFTronViewerDocumentService(null);
|
||||
|
||||
@ -39,6 +39,8 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.tab
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.server.AbstractTest;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
import com.knecon.fforesight.service.viewerdoc.service.PDFTronViewerDocumentService;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -58,6 +60,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
originDocument,
|
||||
new ImageServiceResponse(),
|
||||
tableServiceResponse,
|
||||
IdpResult.empty(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", "document"));
|
||||
|
||||
@ -103,24 +106,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
String textToSearch = "Annex to Regulation 283/2013 Annex to Regulation 284/2013";
|
||||
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
||||
|
||||
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
|
||||
List<PageContents> textPositionPerPage = PageContentExtractor.getDocumentContents(pdfFileResource.getFile(), 4);
|
||||
var textPositions = textPositionPerPage.stream()
|
||||
.flatMap(t -> t.getSortedWords()
|
||||
.flatMap(t -> t.getWords()
|
||||
.stream()
|
||||
.map(Word::toString))
|
||||
.collect(Collectors.joining(" "));
|
||||
assertThat(textPositions.contains(textToSearch)).isFalse();
|
||||
assertThat(textPositions.contains(textToSearch)).isTrue();
|
||||
|
||||
ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
assertThat(classificationDocument.getHeaders()
|
||||
.get(0).getTextBlocks().size()).isEqualTo(3);
|
||||
assertThat(classificationDocument.getHeaders()
|
||||
.get(0).getTextBlocks()
|
||||
.get(0).getWords().size()).isEqualTo(8);
|
||||
assertThat(classificationDocument.getHeaders()
|
||||
.get(0).getTextBlocks()
|
||||
.get(0).toString()).contains(textToSearch);
|
||||
assertThat(classificationDocument.getHeaders().get(0).getTextBlocks().size()).isEqualTo(3);
|
||||
assertThat(classificationDocument.getHeaders().get(0).getTextBlocks().get(0).getWords().size()).isEqualTo(8);
|
||||
assertThat(classificationDocument.getHeaders().get(0).getTextBlocks().get(0).toString()).contains(textToSearch);
|
||||
|
||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH, classificationDocument).document();
|
||||
|
||||
@ -216,8 +214,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(0);
|
||||
.toList().get(0);
|
||||
assertThat(table.getColCount()).isEqualTo(6);
|
||||
assertThat(table.getRowCount()).isEqualTo(13);
|
||||
assertThat(table.getRows()
|
||||
@ -246,8 +243,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(0);
|
||||
.toList().get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems()
|
||||
@ -256,12 +252,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(1);
|
||||
.toList().get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(2);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
@ -293,8 +287,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(0);
|
||||
.toList().get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||
TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems()
|
||||
@ -303,12 +296,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(1);
|
||||
.toList().get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(9);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(firstTable.getRowCount() - 1)
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(firstTable.getRowCount() - 1)
|
||||
.stream()
|
||||
.map(Cell::getHeaderCells)
|
||||
.collect(Collectors.toList());
|
||||
@ -340,8 +331,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(0);
|
||||
.toList().get(0);
|
||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||
TablePageBlock secondTable = document.getSectionTree().getAllTableOfContentItems()
|
||||
@ -350,12 +340,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(1);
|
||||
.toList().get(1);
|
||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||
assertThat(secondTable.getRowCount()).isEqualTo(6);
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows()
|
||||
.get(0)
|
||||
List<List<Cell>> firstTableHeaderCells = firstTable.getRows().get(0)
|
||||
.stream()
|
||||
.map(Collections::singletonList)
|
||||
.collect(Collectors.toList());
|
||||
@ -376,10 +364,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
validateTable(document, 0, 1, 1, 0, 0);
|
||||
validateTable(document, 1, 2, 2, 0, 0);
|
||||
validateTable(document, 2, 4, 19, 12, 0);
|
||||
validateTable(document, 3, 2, 12, 0, 0);
|
||||
validateTable(document, 0, 1, 1, 0);
|
||||
validateTable(document, 1, 2, 2, 0);
|
||||
validateTable(document, 2, 2, 12, 0);
|
||||
validateTable(document, 3, 4, 19, 12);
|
||||
|
||||
}
|
||||
|
||||
@ -393,10 +381,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
validateTable(document, 0, 5, 4, 0, 0);
|
||||
validateTable(document, 1, 5, 15, 14, 0);
|
||||
validateTable(document, 2, 5, 14, 11, 0);
|
||||
validateTable(document, 3, 5, 3, 0, 0);
|
||||
validateTable(document, 0, 5, 4, 0);
|
||||
validateTable(document, 1, 5, 15, 14);
|
||||
validateTable(document, 2, 5, 14, 11);
|
||||
validateTable(document, 3, 5, 3, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -410,7 +398,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 8, 8, 0, 0);
|
||||
validateTable(document, 0, 8, 8, 0);
|
||||
|
||||
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
|
||||
"Author, date",
|
||||
@ -455,10 +443,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
validateTable(document, 0, 3, 2, 0, 0);
|
||||
validateTable(document, 1, 3, 2, 0, 0);
|
||||
validateTable(document, 2, 3, 3, 0, 0);
|
||||
validateTable(document, 3, 3, 3, 0, 0);
|
||||
validateTable(document, 0, 3, 2, 0);
|
||||
validateTable(document, 1, 3, 2, 0);
|
||||
validateTable(document, 2, 3, 3, 0);
|
||||
validateTable(document, 3, 3, 3, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -473,7 +461,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 7, 4, 0, 0);
|
||||
validateTable(document, 0, 7, 4, 0);
|
||||
}
|
||||
|
||||
|
||||
@ -486,7 +474,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 7, 4, 0, 0);
|
||||
validateTable(document, 0, 7, 4, 0);
|
||||
}
|
||||
|
||||
|
||||
@ -499,12 +487,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 6);
|
||||
|
||||
validateTable(document, 0, 2, 1, 0, 0);
|
||||
validateTable(document, 1, 2, 1, 0, 0);
|
||||
validateTable(document, 2, 2, 5, 0, 0);
|
||||
validateTable(document, 3, 2, 5, 0, 0);
|
||||
validateTable(document, 4, 2, 4, 0, 0);
|
||||
validateTable(document, 5, 2, 1, 0, 0);
|
||||
validateTable(document, 0, 2, 1, 0);
|
||||
validateTable(document, 1, 2, 1, 0);
|
||||
validateTable(document, 2, 2, 5, 0);
|
||||
validateTable(document, 3, 2, 5, 0);
|
||||
validateTable(document, 4, 2, 4, 0);
|
||||
validateTable(document, 5, 2, 1, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -518,9 +506,9 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 3);
|
||||
|
||||
validateTable(document, 0, 7, 9, 0, 0);
|
||||
validateTable(document, 1, 2, 1, 0, 0);
|
||||
validateTable(document, 2, 2, 10, 0, 0);
|
||||
validateTable(document, 0, 7, 9, 0);
|
||||
validateTable(document, 1, 2, 1, 0);
|
||||
validateTable(document, 2, 2, 10, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -533,7 +521,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
validateTable(document, 0, 9, 9, 0, 0);
|
||||
validateTable(document, 0, 9, 9, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -547,7 +535,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 9, 5, 6, 0);
|
||||
validateTable(document, 0, 9, 5, 6);
|
||||
|
||||
}
|
||||
|
||||
@ -560,7 +548,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
validateTable(document, 0, 9, 6, 7, 0);
|
||||
validateTable(document, 0, 9, 6, 7);
|
||||
|
||||
}
|
||||
|
||||
@ -574,7 +562,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 1);
|
||||
validateTable(document, 0, 10, 6, 0, 0);
|
||||
validateTable(document, 0, 10, 6, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -588,8 +576,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 2);
|
||||
validateTable(document, 0, 2, 2, 0, 0);
|
||||
validateTable(document, 1, 1, 1, 0, 0);
|
||||
validateTable(document, 0, 2, 2, 0);
|
||||
validateTable(document, 1, 1, 1, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -604,8 +592,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
validateTable(document, 0, 7, 8, 1, 0);
|
||||
validateTable(document, 1, 7, 8, 1, 0);
|
||||
validateTable(document, 0, 7, 8, 1);
|
||||
validateTable(document, 1, 7, 8, 1);
|
||||
|
||||
}
|
||||
|
||||
@ -620,8 +608,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
validateTable(document, 0, 4, 17, 0, 0);
|
||||
validateTable(document, 1, 7, 12, 0, 0);
|
||||
validateTable(document, 0, 4, 17, 0);
|
||||
validateTable(document, 1, 7, 12, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -636,8 +624,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
validateTable(document, 0, 5, 14, 4, 0);
|
||||
validateTable(document, 1, 7, 12, 0, 0);
|
||||
validateTable(document, 0, 5, 14, 4);
|
||||
validateTable(document, 1, 7, 12, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -651,8 +639,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
validateTable(document, 0, 5, 17, 3, 0);
|
||||
validateTable(document, 1, 5, 16, 2, 0);
|
||||
validateTable(document, 0, 5, 17, 3);
|
||||
validateTable(document, 1, 5, 16, 2);
|
||||
|
||||
}
|
||||
|
||||
@ -666,10 +654,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
validateTable(document, 0, 4, 4, 0, 0);
|
||||
validateTable(document, 1, 1, 1, 0, 0);
|
||||
validateTable(document, 2, 2, 3, 0, 0);
|
||||
validateTable(document, 3, 1, 1, 0, 0);
|
||||
validateTable(document, 0, 4, 4, 0);
|
||||
validateTable(document, 1, 1, 1, 0);
|
||||
validateTable(document, 2, 2, 3, 0);
|
||||
validateTable(document, 3, 1, 1, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -684,7 +672,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 11, 8, 0, 0);
|
||||
validateTable(document, 0, 11, 8, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -699,8 +687,8 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 2);
|
||||
|
||||
validateTable(document, 0, 6, 8, 0, 0);
|
||||
validateTable(document, 1, 6, 8, 0, 0);
|
||||
validateTable(document, 0, 6, 8, 0);
|
||||
validateTable(document, 1, 6, 8, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -714,7 +702,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 9, 5, 2, 0);
|
||||
validateTable(document, 0, 9, 5, 2);
|
||||
|
||||
}
|
||||
|
||||
@ -728,7 +716,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 3, 5, 0, 0);
|
||||
validateTable(document, 0, 3, 5, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -742,7 +730,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 6, 8, 0, 0);
|
||||
validateTable(document, 0, 6, 8, 0);
|
||||
}
|
||||
|
||||
|
||||
@ -755,10 +743,10 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 4);
|
||||
|
||||
validateTable(document, 0, 3, 3, 0, 0);
|
||||
validateTable(document, 1, 3, 6, 2, 0);
|
||||
validateTable(document, 2, 3, 3, 1, 0);
|
||||
validateTable(document, 3, 3, 3, 0, 0);
|
||||
validateTable(document, 0, 3, 6, 0);
|
||||
validateTable(document, 1, 3, 3, 0);
|
||||
validateTable(document, 2, 3, 3, 0);
|
||||
validateTable(document, 3, 3, 3, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -772,12 +760,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 6);
|
||||
|
||||
validateTable(document, 0, 5, 5, 0, 0);
|
||||
validateTable(document, 1, 5, 6, 0, 0);
|
||||
validateTable(document, 2, 5, 5, 0, 0);
|
||||
validateTable(document, 3, 5, 5, 0, 0);
|
||||
validateTable(document, 4, 5, 5, 0, 0);
|
||||
validateTable(document, 5, 5, 5, 0, 0);
|
||||
validateTable(document, 0, 5, 6, 0);
|
||||
validateTable(document, 1, 5, 5, 0);
|
||||
validateTable(document, 2, 5, 5, 0);
|
||||
validateTable(document, 3, 5, 5, 0);
|
||||
validateTable(document, 4, 5, 5, 0);
|
||||
validateTable(document, 5, 5, 5, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -791,7 +779,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 6, 5, 0, 0);
|
||||
validateTable(document, 0, 6, 5, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -805,7 +793,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 5, 8, 1, 0);
|
||||
validateTable(document, 0, 5, 8, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -816,13 +804,14 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/T5_Page16_VV-640252.pdf");
|
||||
|
||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||
|
||||
validateTableSize(document, 5);
|
||||
validateTable(document, 0, 1, 1, 0, 0);
|
||||
validateTable(document, 1, 1, 1, 0, 0);
|
||||
validateTable(document, 2, 1, 1, 0, 0);
|
||||
validateTable(document, 3, 1, 1, 0, 0);
|
||||
validateTable(document, 4, 1, 1, 0, 0);
|
||||
validateTableSize(document, 6);
|
||||
// does not make sense to assert anything here other than that it runs. This is not a Table and completely breaks the current table detection logic
|
||||
// viewerDocumentService.addLayerGroups(pdfFileResource.getFile(), new File("/tmp/cellDebug.pdf"), List.of(document.getLayoutDebugLayer()));
|
||||
// validateTable(document, 0, 1, 1, 0);
|
||||
// validateTable(document, 1, 1, 1, 0);
|
||||
// validateTable(document, 2, 1, 1, 3);
|
||||
// validateTable(document, 3, 1, 1, 0);
|
||||
// validateTable(document, 4, 1, 1, 0);
|
||||
|
||||
}
|
||||
|
||||
@ -836,7 +825,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
|
||||
validateTableSize(document, 1);
|
||||
|
||||
validateTable(document, 0, 6, 6, 5, 0);
|
||||
validateTable(document, 0, 6, 6, 5);
|
||||
|
||||
}
|
||||
|
||||
@ -869,7 +858,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
}
|
||||
|
||||
|
||||
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
||||
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect) {
|
||||
|
||||
TablePageBlock table = document.getSectionTree().getAllTableOfContentItems()
|
||||
.stream()
|
||||
@ -877,8 +866,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(tableIndex);
|
||||
.toList().get(tableIndex);
|
||||
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
int emptyCellsFoundFound = rows.stream()
|
||||
@ -891,7 +879,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
for (List<Cell> row : table.getRows()) {
|
||||
row.forEach(r -> System.out.println(r.toString()));
|
||||
}
|
||||
assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect + emptyCellsCountIncorrect);
|
||||
assertThat(emptyCellsFoundFound).isEqualTo(emptyCellsCountCorrect);
|
||||
|
||||
assertThat(table.getColCount()).isEqualTo(colCount);
|
||||
assertThat(table.getRowCount()).isEqualTo(rowCount);
|
||||
@ -907,8 +895,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
||||
.stream()
|
||||
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||
.toList()
|
||||
.get(tableIndex);
|
||||
.toList().get(tableIndex);
|
||||
List<List<Cell>> rows = table.getRows();
|
||||
|
||||
List<Cell> rowsFlattened = rows.stream()
|
||||
|
||||
@ -6,14 +6,10 @@ import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.GapInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.DividingColumnDetectionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.GapDetectionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.GapsAcrossLinesService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.experimental.DividingColumnDetectionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
@ -21,32 +17,6 @@ import lombok.SneakyThrows;
|
||||
|
||||
class GapAcrossLinesDetectionServiceTest {
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
public void testGapBasedColumnDetection() {
|
||||
|
||||
String filename = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
List<PageInformation> pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList();
|
||||
List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
|
||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start column detection");
|
||||
start = System.currentTimeMillis();
|
||||
for (PageInformation pageInformation : pageInformations) {
|
||||
GapInformation gapInformation = GapDetectionService.findGapsInLines(pageInformation.getPageContents().getSortedWords(), pageInformation.getMainBodyTextFrame());
|
||||
columnsPerPage.add(GapsAcrossLinesService.detectXGapsAcrossLines(gapInformation, pageInformation.getMainBodyTextFrame()));
|
||||
}
|
||||
System.out.printf("Finished column detection in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start draw rectangles");
|
||||
start = System.currentTimeMillis();
|
||||
PdfDraw.drawRectanglesPerPage(filename, columnsPerPage, tmpFileName);
|
||||
System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
@ -56,7 +26,7 @@ class GapAcrossLinesDetectionServiceTest {
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_COLUMNS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
List<PageContents> sortedTextPositionSequencesPerPage = PageContentExtractor.getSortedPageContents(filename);
|
||||
List<PageContents> sortedTextPositionSequencesPerPage = PageContentExtractor.getDocumentContents(new ClassPathResource(filename).getFile(), 4);
|
||||
List<List<Rectangle2D>> columnsPerPage = new LinkedList<>();
|
||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start column detection");
|
||||
|
||||
@ -1,66 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import java.awt.geom.Rectangle2D;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.InvisibleTableDetectionService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class InvisibleTableDetectionServiceTest {
|
||||
|
||||
@Test
|
||||
// @Disabled
|
||||
@SneakyThrows
|
||||
public void detectInvisibleTableTest() {
|
||||
|
||||
String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
|
||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
|
||||
List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName)
|
||||
.stream()
|
||||
.map(PageInformationService::build)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
int pageNumber = 1;
|
||||
Rectangle2D tableBBox = pageContents.get(0).getPageContents().getSortedWords().subList(45, 152)
|
||||
.stream()
|
||||
.map(Word::getBBox)
|
||||
.map(this::mirrorY)
|
||||
.collect(RectangleTransformations.collectBBox());
|
||||
|
||||
List<Word> words = pageContents.get(0).getPageContents().getSortedWords()
|
||||
.stream()
|
||||
.filter(textPositionSequence -> tableBBox.contains(mirrorY(textPositionSequence.getBBox())))
|
||||
.toList();
|
||||
|
||||
var table = InvisibleTableDetectionService.detectTable(words, tableBBox);
|
||||
|
||||
PdfDraw.drawRectanglesPerPage(fileName,
|
||||
List.of(table.stream()
|
||||
.flatMap(Collection::stream)
|
||||
.toList(), Collections.emptyList()),
|
||||
tmpFileName);
|
||||
}
|
||||
|
||||
|
||||
private Rectangle2D mirrorY(Rectangle2D rectangle2D) {
|
||||
|
||||
if (rectangle2D.getHeight() >= 0) {
|
||||
return rectangle2D;
|
||||
}
|
||||
return new Rectangle2D.Double(rectangle2D.getX(), rectangle2D.getY() + rectangle2D.getHeight(), rectangle2D.getWidth(), -rectangle2D.getHeight());
|
||||
}
|
||||
|
||||
}
|
||||
@ -5,6 +5,7 @@ import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
@ -20,7 +21,7 @@ class MainBodyTextFrameExtractionServiceTest {
|
||||
|
||||
String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
|
||||
String tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_MAIN_BODY.pdf").toString();
|
||||
List<PageContents> sortedTextPositionSequence = PageContentExtractor.getSortedPageContents(fileName);
|
||||
List<PageContents> sortedTextPositionSequence = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4);
|
||||
|
||||
|
||||
}
|
||||
|
||||
@ -4,6 +4,7 @@ import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
@ -21,11 +22,11 @@ class PageContentExtractorTest {
|
||||
String fileName = "files/syngenta/CustomerFiles/Documine/Flora/SOLICITA_VICTRATO-GOLD-II_Item 21_Mutacao_Genica.pdf";
|
||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TEXT_POSITION_SEQUENCES.pdf").toString();
|
||||
|
||||
List<PageContents> textPositionPerPage = PageContentExtractor.getSortedPageContents(fileName);
|
||||
List<PageContents> textPositionPerPage = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4);
|
||||
|
||||
PdfDraw.drawRectanglesPerPageNumberedByLine(fileName,
|
||||
textPositionPerPage.stream()
|
||||
.map(t -> t.getSortedWords()
|
||||
.map(t -> t.getWords()
|
||||
.stream()
|
||||
.map(Word::getBBoxPdf)
|
||||
.map(List::of)
|
||||
|
||||
@ -1,63 +0,0 @@
|
||||
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageInformation;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageInformationService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
class PageInformationServiceTest {
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
public void testGapDetection() {
|
||||
|
||||
String filename = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
List<PageInformation> pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList();
|
||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start gap detection");
|
||||
start = System.currentTimeMillis();
|
||||
System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start draw rectangles");
|
||||
start = System.currentTimeMillis();
|
||||
PdfDraw.drawRectanglesAndLinesPerPage(filename,
|
||||
pageInformations.stream().map(PageInformation::getGapInformation).map(gaps -> gaps.getYGaps().stream().flatMap(Collection::stream).toList()).toList(),
|
||||
pageInformations.stream().map(PageInformation::getGapInformation).map(gaps -> gaps.getXGaps().stream().flatMap(Collection::stream).toList()).toList(),
|
||||
tmpFileName);
|
||||
System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
@SneakyThrows
|
||||
public void testLineDetection() {
|
||||
|
||||
String filename = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
|
||||
var tmpFileName = "/tmp/" + filename.split("/")[2] + "_GAPS.pdf";
|
||||
System.out.println("start TextPosition extraction");
|
||||
long start = System.currentTimeMillis();
|
||||
List<PageInformation> pageInformations = PageContentExtractor.getSortedPageContents(filename).stream().map(PageInformationService::build).toList();
|
||||
System.out.printf("Finished TextPosition Extraction in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start gap detection");
|
||||
start = System.currentTimeMillis();
|
||||
System.out.printf("Finished gap detection in %d ms%n", System.currentTimeMillis() - start);
|
||||
System.out.println("start draw rectangles");
|
||||
start = System.currentTimeMillis();
|
||||
PdfDraw.drawRectanglesPerPageNumberedByLine(filename,
|
||||
pageInformations.stream().map(PageInformation::getLineInformation).map(gaps -> gaps.getBBoxWithGapsByLines().stream().toList()).toList(),
|
||||
tmpFileName);
|
||||
System.out.printf("Finished drawing rectangles in %d ms%n", System.currentTimeMillis() - start);
|
||||
}
|
||||
|
||||
}
|
||||
@ -34,9 +34,10 @@ import com.knecon.fforesight.service.layoutparser.processor.services.PageContent
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DrawingOptions;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.tables.RectangularIntersectionFinder;
|
||||
import com.knecon.fforesight.service.layoutparser.server.BuildDocumentTest;
|
||||
import com.knecon.fforesight.service.layoutparser.server.utils.visualizations.PdfDraw;
|
||||
import com.knecon.fforesight.service.ocr.v1.api.model.IdpResult;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -49,7 +50,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
|
||||
String fileName = "files/syngenta/CustomerFiles/SinglePages/T5_Page16_VV-640252.pdf";
|
||||
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_CELLS.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
List<PageContents> pageContents = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4);
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
List<List<Rectangle2D>> rectanglesPerPage = new LinkedList<>();
|
||||
for (PageContents pageContent : pageContents) {
|
||||
@ -69,7 +70,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
|
||||
String fileName = "files/syngenta/CustomerFiles/SinglePages/Page35_19 Chlorothalonil RAR 08 Volume 3CA B 6b metabolites Oct 2017.pdf";
|
||||
String lineFileName = "/tmp/" + Path.of(fileName).getFileName().toString() + "_LINES.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
List<PageContents> pageContents = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4);
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
|
||||
for (PageContents pageContent : pageContents) {
|
||||
@ -110,6 +111,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
IdpResult.empty(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", filename.toFile().toString()))).document();
|
||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_WITHOUT_DUPLICATE_PARAGRAPH,
|
||||
@ -117,6 +119,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
||||
filename.toFile(),
|
||||
new ImageServiceResponse(),
|
||||
new TableServiceResponse(),
|
||||
IdpResult.empty(),
|
||||
new VisualLayoutParsingResponse(),
|
||||
Map.of("file", filename.toFile().toString()))).document();
|
||||
DocumentData documentDataBefore = DocumentDataMapper.toDocumentData(documentGraphBefore);
|
||||
|
||||
@ -7,6 +7,7 @@ import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.springframework.core.io.ClassPathResource;
|
||||
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||
@ -15,7 +16,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.text.Word;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||
import com.knecon.fforesight.service.layoutparser.processor.services.tables.RectangularIntersectionFinder;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
@ -26,19 +27,19 @@ public class RulingsClassifierTest {
|
||||
public void textRulingExtractionTest() {
|
||||
|
||||
String fileName = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
List<PageContents> pageContents = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4);
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
|
||||
for (PageContents pageContent : pageContents) {
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
|
||||
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedWords(), cleanRulings);
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getWords(), cleanRulings);
|
||||
|
||||
assertTrue(pageContent.getSortedWords()
|
||||
assertTrue(pageContent.getWords()
|
||||
.stream()
|
||||
.filter(word -> word.toString().equals("Underlined"))
|
||||
.allMatch(Word::isUnderline));
|
||||
assertTrue(pageContent.getSortedWords()
|
||||
assertTrue(pageContent.getWords()
|
||||
.stream()
|
||||
.filter(word -> word.toString().equals("Striketrough"))
|
||||
.allMatch(Word::isStrikethrough));
|
||||
@ -64,13 +65,13 @@ public class RulingsClassifierTest {
|
||||
public void tableRulingExtractionTest() {
|
||||
|
||||
String fileName = "files/SinglePages/AbsolutelyEnormousTable.pdf";
|
||||
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||
List<PageContents> pageContents = PageContentExtractor.getDocumentContents(new ClassPathResource(fileName).getFile(), 4);
|
||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||
|
||||
for (PageContents pageContent : pageContents) {
|
||||
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
|
||||
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedWords(), cleanRulings);
|
||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getWords(), cleanRulings);
|
||||
|
||||
assertEquals(30, cleanRulings.getHorizontals().size());
|
||||
assertEquals(30, cleanRulings.getTableLines().getHorizontals().size());
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user