Merge branch 'main' into RED-7074
# Conflicts: # layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/LayoutParsingPipeline.java # layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/model/text/TextPageBlock.java # layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/SectionsBuilderService.java # layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/TableExtractionService.java # layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/blockification/DocstrumBlockificationService.java # layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/classification/DocuMineClassificationService.java # layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/DocumentGraphFactory.java # layoutparser-service/layoutparser-service-processor/src/main/java/com/knecon/fforesight/service/layoutparser/processor/services/factory/SectionNodeFactory.java # layoutparser-service/layoutparser-service-server/src/test/java/com/knecon/fforesight/service/layoutparser/server/HeadlinesGoldStandardIntegrationTest.java
This commit is contained in:
commit
61c90fc30d
1
.gitattributes
vendored
Normal file
1
.gitattributes
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
*.pdf filter=lfs diff=lfs merge=lfs -text
|
||||||
4
.gitmodules
vendored
4
.gitmodules
vendored
@ -1,8 +1,8 @@
|
|||||||
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/basf"]
|
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/basf"]
|
||||||
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/basf
|
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/basf
|
||||||
url = https://gitlab.knecon.com/fforesight/documents/basf.git
|
url = ssh://git@git.knecon.com:22222/fforesight/documents/basf.git
|
||||||
update = merge
|
update = merge
|
||||||
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta"]
|
[submodule "layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta"]
|
||||||
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta
|
path = layoutparser-service/layoutparser-service-server/src/test/resources/files/syngenta
|
||||||
url = https://gitlab.knecon.com/fforesight/documents/syngenta.git
|
url = ssh://git@git.knecon.com:22222/fforesight/documents/syngenta.git
|
||||||
update = merge
|
update = merge
|
||||||
|
|||||||
@ -5,6 +5,7 @@ public enum LayoutParsingType {
|
|||||||
REDACT_MANAGER_OLD,
|
REDACT_MANAGER_OLD,
|
||||||
REDACT_MANAGER_PARAGRAPH_DEBUG,
|
REDACT_MANAGER_PARAGRAPH_DEBUG,
|
||||||
DOCUMINE,
|
DOCUMINE,
|
||||||
|
DOCUMINE_OLD,
|
||||||
CLARIFYND,
|
CLARIFYND,
|
||||||
CLARIFYND_PARAGRAPH_DEBUG
|
CLARIFYND_PARAGRAPH_DEBUG
|
||||||
}
|
}
|
||||||
|
|||||||
@ -52,6 +52,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBui
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.RedactManagerBlockificationService;
|
||||||
@ -59,12 +60,14 @@ import com.knecon.fforesight.service.layoutparser.processor.services.classificat
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.DocuMineClassificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.Box;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.graphics.GraphicExtractorService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.DocumentDataMapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.mapper.TaasDocumentDataMapper;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
import com.knecon.fforesight.service.layoutparser.processor.services.parsing.PDFLinesTextStripper;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.visualization.LayoutGridService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||||
|
|
||||||
import io.micrometer.observation.Observation;
|
import io.micrometer.observation.Observation;
|
||||||
import io.micrometer.observation.ObservationRegistry;
|
import io.micrometer.observation.ObservationRegistry;
|
||||||
@ -104,6 +107,7 @@ public class LayoutParsingPipeline {
|
|||||||
OutlineExtractorService outlineExtractorService;
|
OutlineExtractorService outlineExtractorService;
|
||||||
OutlineValidationService outlineValidationService;
|
OutlineValidationService outlineValidationService;
|
||||||
TOCEnrichmentService tocEnrichmentService;
|
TOCEnrichmentService tocEnrichmentService;
|
||||||
|
LayoutparserSettings settings;
|
||||||
|
|
||||||
|
|
||||||
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
public LayoutParsingFinishedEvent parseLayoutAndSaveFilesToStorage(LayoutParsingRequest layoutParsingRequest) throws IOException {
|
||||||
@ -136,7 +140,8 @@ public class LayoutParsingPipeline {
|
|||||||
.get());
|
.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = parseLayout(layoutParsingRequest.layoutParsingType(),
|
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
|
||||||
|
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(),
|
||||||
originFile,
|
originFile,
|
||||||
imageServiceResponse,
|
imageServiceResponse,
|
||||||
tableServiceResponse,
|
tableServiceResponse,
|
||||||
@ -145,11 +150,12 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
log.info("Building document graph for {}", layoutParsingRequest.identifier());
|
||||||
|
|
||||||
Document documentGraph = observeBuildDocumentGraph(layoutParsingRequest.layoutParsingType(), classificationDocument);
|
Document documentGraph = observeBuildDocumentGraph(settings.getLayoutParsingTypeOverride() == null //
|
||||||
|
? layoutParsingRequest.layoutParsingType() : settings.getLayoutParsingTypeOverride(), classificationDocument);
|
||||||
|
|
||||||
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
log.info("Creating viewer document for {}", layoutParsingRequest.identifier());
|
||||||
|
|
||||||
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false);
|
layoutGridService.addLayoutGrid(viewerDocumentFile, documentGraph, viewerDocumentFile, false, layoutParsingRequest.visualLayoutParsingFileId().isPresent());
|
||||||
|
|
||||||
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
log.info("Storing resulting files for {}", layoutParsingRequest.identifier());
|
||||||
|
|
||||||
@ -239,6 +245,11 @@ public class LayoutParsingPipeline {
|
|||||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||||
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
||||||
ClassificationDocument classificationDocument = new ClassificationDocument();
|
ClassificationDocument classificationDocument = new ClassificationDocument();
|
||||||
|
|
||||||
|
if (settings.isDebug() || identifier.containsKey("debug")) {
|
||||||
|
classificationDocument.getVisualizations().setActive(true);
|
||||||
|
}
|
||||||
|
|
||||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||||
OutlineObject lastProcessedOutlineObject = null;
|
OutlineObject lastProcessedOutlineObject = null;
|
||||||
|
|
||||||
@ -267,10 +278,12 @@ public class LayoutParsingPipeline {
|
|||||||
stripper.setStartPage(pageNumber);
|
stripper.setStartPage(pageNumber);
|
||||||
stripper.setEndPage(pageNumber);
|
stripper.setEndPage(pageNumber);
|
||||||
stripper.setPdpage(pdPage);
|
stripper.setPdpage(pdPage);
|
||||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE)) {
|
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD)) {
|
||||||
stripper.setSortByPosition(true);
|
stripper.setSortByPosition(true);
|
||||||
}
|
}
|
||||||
stripper.getText(originDocument);
|
stripper.getText(originDocument);
|
||||||
|
List<TextPositionSequence> words = stripper.getTextPositionSequences();
|
||||||
|
classificationDocument.getVisualizations().addTextVisualizations(words, pageNumber);
|
||||||
|
|
||||||
PDRectangle pdr = pdPage.getMediaBox();
|
PDRectangle pdr = pdPage.getMediaBox();
|
||||||
|
|
||||||
@ -278,16 +291,21 @@ public class LayoutParsingPipeline {
|
|||||||
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
boolean isLandscape = pdr.getWidth() > pdr.getHeight() && (rotation == 0 || rotation == 180) || pdr.getHeight() > pdr.getWidth() && (rotation == 90 || rotation == 270);
|
||||||
|
|
||||||
PDRectangle cropbox = pdPage.getCropBox();
|
PDRectangle cropbox = pdPage.getCropBox();
|
||||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
classificationDocument.getVisualizations().addRulingVisualization(stripper.getRulings(), pageNumber);
|
||||||
|
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(pdfTableCells.get(pageNumber), stripper.getRulings());
|
||||||
|
|
||||||
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
PageInformation pageInformation = PageInformation.fromPDPage(pageNumber, pdPage);
|
||||||
|
List<Cell> emptyTableCells = TableExtractionService.findCells(cleanRulings.getHorizontals(), cleanRulings.getVerticals(), pageInformation);
|
||||||
|
classificationDocument.getVisualizations().addCellVisualizations(emptyTableCells, pageNumber);
|
||||||
|
|
||||||
var graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
|
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
|
||||||
|
|
||||||
|
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
|
||||||
pdPage,
|
pdPage,
|
||||||
pageNumber,
|
pageNumber,
|
||||||
cleanRulings,
|
cleanRulings,
|
||||||
stripper.getTextPositionSequences(),
|
stripper.getTextPositionSequences(),
|
||||||
emptyTableCells,
|
|
||||||
false);
|
false);
|
||||||
|
|
||||||
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
||||||
@ -296,10 +314,13 @@ public class LayoutParsingPipeline {
|
|||||||
.toList());
|
.toList());
|
||||||
|
|
||||||
ClassificationPage classificationPage = switch (layoutParsingType) {
|
ClassificationPage classificationPage = switch (layoutParsingType) {
|
||||||
case REDACT_MANAGER_OLD -> redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells);
|
case REDACT_MANAGER_OLD ->
|
||||||
case DOCUMINE -> docuMineBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
redactManagerBlockificationService.blockify(stripper.getTextPositionSequences(), cleanRulings, classificationDocument.getVisualizations());
|
||||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, true);
|
case DOCUMINE_OLD -> docuMineBlockificationService.blockify(words, cleanRulings);
|
||||||
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> docstrumBlockificationService.blockify(stripper.getTextPositionSequences(), emptyTableCells, false);
|
case DOCUMINE, REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG ->
|
||||||
|
docstrumBlockificationService.blockify(words, cleanRulings, true, classificationDocument.getVisualizations(), layoutParsingType);
|
||||||
|
case CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||||
|
docstrumBlockificationService.blockify(words, cleanRulings, false, classificationDocument.getVisualizations(), layoutParsingType);
|
||||||
};
|
};
|
||||||
|
|
||||||
classificationPage.setCleanRulings(cleanRulings);
|
classificationPage.setCleanRulings(cleanRulings);
|
||||||
@ -321,11 +342,12 @@ public class LayoutParsingPipeline {
|
|||||||
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
|
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
|
||||||
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
||||||
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
|
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
|
||||||
|
|
||||||
// If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
|
// If images is ocr needs to be calculated before textBlocks are moved into tables, otherwise findOcr algorithm needs to be adopted.
|
||||||
if (pdfImages != null && pdfImages.containsKey(pageNumber)) {
|
if (pdfImages.containsKey(pageNumber)) {
|
||||||
classificationPage.setImages(pdfImages.get(pageNumber));
|
classificationPage.setImages(pdfImages.get(pageNumber));
|
||||||
imageServiceResponseAdapter.findOcr(classificationPage);
|
imageServiceResponseAdapter.findOcr(classificationPage);
|
||||||
}
|
}
|
||||||
@ -340,12 +362,6 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
tableExtractionService.extractTables(emptyTableCells, classificationPage);
|
tableExtractionService.extractTables(emptyTableCells, classificationPage);
|
||||||
|
|
||||||
if (layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
|
||||||
docstrumBlockificationService.combineBlocks(classificationPage);
|
|
||||||
} else if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
|
||||||
docstrumBlockificationService.mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 6.5f);
|
|
||||||
}
|
|
||||||
|
|
||||||
buildPageStatistics(classificationPage);
|
buildPageStatistics(classificationPage);
|
||||||
increaseDocumentStatistics(classificationPage, classificationDocument);
|
increaseDocumentStatistics(classificationPage, classificationDocument);
|
||||||
|
|
||||||
@ -356,11 +372,14 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
log.info("Calculating BodyTextFrame for {}", identifier);
|
log.info("Calculating BodyTextFrame for {}", identifier);
|
||||||
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
bodyTextFrameService.setBodyTextFrames(classificationDocument, layoutParsingType);
|
||||||
|
for (ClassificationPage page : classificationDocument.getPages()) {
|
||||||
|
classificationDocument.getVisualizations().addCleanRulingVisualization(page.getCleanRulings(), page.getPageNumber());
|
||||||
|
}
|
||||||
log.info("Classify TextBlocks for {}", identifier);
|
log.info("Classify TextBlocks for {}", identifier);
|
||||||
switch (layoutParsingType) {
|
switch (layoutParsingType) {
|
||||||
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG ->
|
case REDACT_MANAGER, REDACT_MANAGER_PARAGRAPH_DEBUG, REDACT_MANAGER_OLD, CLARIFYND_PARAGRAPH_DEBUG ->
|
||||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
redactManagerClassificationService.classifyDocument(classificationDocument);
|
||||||
case DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
case DOCUMINE_OLD, DOCUMINE -> docuMineClassificationService.classifyDocument(classificationDocument);
|
||||||
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,20 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor;
|
||||||
|
|
||||||
|
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||||
|
import org.springframework.context.annotation.Configuration;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@Configuration
|
||||||
|
@ConfigurationProperties("layoutparser")
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class LayoutparserSettings {
|
||||||
|
|
||||||
|
boolean debug;
|
||||||
|
LayoutParsingType layoutParsingTypeOverride;
|
||||||
|
}
|
||||||
@ -7,14 +7,18 @@ import java.util.stream.Collectors;
|
|||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.LineBuilderService;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.LineBuilderService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.NearestNeighbourService;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.NearestNeighbourService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ReadingOrderService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.SpacingService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.service.ZoneBuilderService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
@ -29,31 +33,37 @@ public class DocstrumSegmentationService {
|
|||||||
private final ReadingOrderService readingOrderService;
|
private final ReadingOrderService readingOrderService;
|
||||||
|
|
||||||
|
|
||||||
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder) {
|
public List<Zone> segmentPage(List<TextPositionSequence> textPositions, boolean xyOrder, CleanRulings usedRulings, LayoutparsingVisualizations visualizations) {
|
||||||
|
|
||||||
List<Zone> zones = new ArrayList<>();
|
List<Zone> zones = new ArrayList<>();
|
||||||
zones.addAll(computeZones(textPositions, TextDirection.ZERO));
|
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.ZERO));
|
||||||
zones.addAll(computeZones(textPositions, TextDirection.QUARTER_CIRCLE));
|
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.QUARTER_CIRCLE));
|
||||||
zones.addAll(computeZones(textPositions, TextDirection.HALF_CIRCLE));
|
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.HALF_CIRCLE));
|
||||||
zones.addAll(computeZones(textPositions, TextDirection.THREE_QUARTER_CIRCLE));
|
zones.addAll(computeZones(textPositions, usedRulings, visualizations, TextDirection.THREE_QUARTER_CIRCLE));
|
||||||
|
|
||||||
return readingOrderService.resolve(zones, xyOrder);
|
return readingOrderService.resolve(zones, xyOrder);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<Zone> computeZones(List<TextPositionSequence> textPositions, TextDirection direction) {
|
private List<Zone> computeZones(List<TextPositionSequence> textPositions, CleanRulings rulings, LayoutparsingVisualizations visualizations, TextDirection direction) {
|
||||||
|
|
||||||
var positions = textPositions.stream().filter(t -> t.getDir() == direction).map(TextPositionSequence::getTextPositions).flatMap(List::stream).toList();
|
List<RedTextPosition> positions = textPositions.stream()
|
||||||
|
.filter(t -> t.getDir() == direction)
|
||||||
|
.map(TextPositionSequence::getTextPositions)
|
||||||
|
.flatMap(List::stream)
|
||||||
|
.toList();
|
||||||
|
|
||||||
var characters = positions.stream().map(Character::new).collect(Collectors.toList());
|
List<Character> characters = positions.stream()
|
||||||
|
.map(Character::new)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
nearestNeighbourService.findNearestNeighbors(characters);
|
nearestNeighbourService.findNearestNeighbors(characters);
|
||||||
|
|
||||||
var characterSpacing = spacingService.computeCharacterSpacing(characters);
|
double characterSpacing = spacingService.computeCharacterSpacing(characters);
|
||||||
var lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
|
double lineSpacing = Math.min(spacingService.computeLineSpacing(characters), 20);
|
||||||
|
|
||||||
var lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing);
|
List<Line> lines = lineBuilderService.buildLines(characters, characterSpacing, lineSpacing, rulings);
|
||||||
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing);
|
return zoneBuilderService.buildZones(lines, characterSpacing, lineSpacing, rulings);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,13 +1,27 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
public abstract class BoundingBox {
|
public abstract class BoundingBox {
|
||||||
|
|
||||||
private Rectangle2D bBox;
|
// Java coordinate system: (0, 0) is always upper left, x is increasing left to right and y is increasing from top to bottom.
|
||||||
|
// should be used when determining reading order or other tasks which require coordinates in a harmonized system.
|
||||||
|
protected Rectangle2D bBox; // I would not trust this coordinate when comparing rulings and text, due to the text positions being slightly off.
|
||||||
|
|
||||||
|
// PDF coordinate system: depends on page rotation, (0, 0) is lower left corner, x is increasing left to right and y from bottom to top.
|
||||||
|
// This rotates completely in 90 degree steps with page rotation.
|
||||||
|
// Needs to be used when writing to a PDF.
|
||||||
|
// Also, these are definitely correct and should be used whenever possible.
|
||||||
|
protected Rectangle2D bBoxInitialUserSpace;
|
||||||
|
|
||||||
|
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
||||||
|
|
||||||
|
|
||||||
public double getX() {
|
public double getX() {
|
||||||
@ -22,6 +36,42 @@ public abstract class BoundingBox {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getMinX() {
|
||||||
|
|
||||||
|
return bBox.getMinX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getMinY() {
|
||||||
|
|
||||||
|
return bBox.getMinY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getPdfMinX() {
|
||||||
|
|
||||||
|
return bBoxInitialUserSpace.getMinX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getPdfMaxX() {
|
||||||
|
|
||||||
|
return bBoxInitialUserSpace.getMaxX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getPdfMinY() {
|
||||||
|
|
||||||
|
return bBoxInitialUserSpace.getMinY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getPdfMaxY() {
|
||||||
|
|
||||||
|
return bBoxInitialUserSpace.getMaxY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public double getWidth() {
|
public double getWidth() {
|
||||||
|
|
||||||
return bBox.getWidth();
|
return bBox.getWidth();
|
||||||
@ -34,21 +84,170 @@ public abstract class BoundingBox {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getMaxX() {
|
||||||
|
|
||||||
|
return bBox.getMaxX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double getMaxY() {
|
||||||
|
|
||||||
|
return bBox.getMaxY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public double getArea() {
|
public double getArea() {
|
||||||
|
|
||||||
return (bBox.getHeight() * bBox.getWidth());
|
return (bBox.getHeight() * bBox.getWidth());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean contains(Rectangle2D contained, double tolerance) {
|
public boolean contains(BoundingBox contained) {
|
||||||
|
|
||||||
return bBox.getX() <= contained.getX() + tolerance && bBox.getY() <= contained.getY() + tolerance && bBox.getX() + bBox.getWidth() >= contained.getX() + contained.getWidth() - tolerance && bBox.getY() + bBox.getHeight() >= contained.getY() + contained.getHeight() - tolerance;
|
return contains(contained, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean contains(BoundingBox contained, double tolerance) {
|
||||||
|
|
||||||
|
return getPdfMinX() <= contained.getPdfMinX() + tolerance
|
||||||
|
&& getPdfMinY() <= contained.getPdfMinY() + tolerance
|
||||||
|
&& getPdfMaxX() >= contained.getPdfMaxX() - tolerance
|
||||||
|
&& getPdfMaxY() >= contained.getPdfMaxY() - tolerance;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersects(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsX(other) && this.intersectsY(other);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersects(BoundingBox other, float yThreshold, float xThreshold) {
|
||||||
|
|
||||||
|
return this.intersectsX(other, xThreshold) && this.intersectsY(other, yThreshold);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean intersectsY(BoundingBox other) {
|
public boolean intersectsY(BoundingBox other) {
|
||||||
|
|
||||||
return this.getBBox().getMinY() <= other.getBBox().getMaxY() && this.getBBox().getMaxY() >= other.getBBox().getMinY();
|
return this.getPdfMinY() <= other.getPdfMaxY() && this.getPdfMaxY() >= other.getPdfMinY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsYJava(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.getY() <= other.getMaxY() && this.getMaxY() >= other.getY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsY(BoundingBox other, float threshold) {
|
||||||
|
|
||||||
|
return this.getPdfMinY() - threshold <= other.getPdfMaxY() && this.getPdfMaxY() + threshold >= other.getPdfMinY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsX(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.getPdfMinX() <= other.getPdfMaxX() && this.getPdfMaxX() >= other.getPdfMinX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsXJava(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.getX() <= other.getMaxX() && this.getMaxX() >= other.getMinX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean intersectsX(BoundingBox other, float threshold) {
|
||||||
|
|
||||||
|
return this.getPdfMinX() - threshold <= other.getMaxX() && this.getMaxX() + threshold >= other.getPdfMinX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void setToBBoxOfComponents(List<? extends BoundingBox> components) {
|
||||||
|
|
||||||
|
this.bBox = components.stream()
|
||||||
|
.map(BoundingBox::getBBox)
|
||||||
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
this.bBoxInitialUserSpace = components.stream()
|
||||||
|
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||||
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double verticalOverlap(BoundingBox other) {
|
||||||
|
|
||||||
|
return Math.max(0, Math.min(this.getPdfMaxY(), other.getPdfMaxY()) - Math.max(this.getPdfMinY(), other.getPdfMinY()));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static final Comparator<BoundingBox> ILL_DEFINED_ORDER = (o1, o2) -> {
|
||||||
|
|
||||||
|
if (o1.equals(o2)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD * ((o1.getHeight() + o2.getHeight()) / 2)) {
|
||||||
|
return Double.compare(o1.getPdfMinX(), o2.getPdfMinX());
|
||||||
|
} else {
|
||||||
|
return Double.compare(o1.getPdfMaxY(), o2.getPdfMaxY());
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
public double horizontalDistance(BoundingBox other) {
|
||||||
|
|
||||||
|
Rectangle2D left;
|
||||||
|
Rectangle2D right;
|
||||||
|
if (this.leftOf(other)) {
|
||||||
|
left = this.getBBox();
|
||||||
|
right = other.getBBox();
|
||||||
|
} else {
|
||||||
|
left = other.getBBox();
|
||||||
|
right = this.getBBox();
|
||||||
|
}
|
||||||
|
|
||||||
|
return Math.max(0, right.getMinX() - left.getMaxX());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double verticalDistance(BoundingBox other) {
|
||||||
|
|
||||||
|
Rectangle2D bottom;
|
||||||
|
Rectangle2D top;
|
||||||
|
if (this.isAbove(other)) {
|
||||||
|
top = this.getBBox();
|
||||||
|
bottom = other.getBBox();
|
||||||
|
} else {
|
||||||
|
bottom = this.getBBox();
|
||||||
|
top = other.getBBox();
|
||||||
|
}
|
||||||
|
|
||||||
|
return Math.max(0, bottom.getMinY() - top.getMaxY());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean rightOf(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsYJava(other) && other.getMaxX() <= this.getMinX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean leftOf(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsYJava(other) && other.getMinX() >= this.getMaxX();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isAbove(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsXJava(other) && other.getMinY() >= this.getMaxY();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isBelow(BoundingBox other) {
|
||||||
|
|
||||||
|
return this.intersectsXJava(other) && this.getMinY() >= other.getMaxY();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -27,8 +27,8 @@ public class Character {
|
|||||||
|
|
||||||
public Character(RedTextPosition chunk) {
|
public Character(RedTextPosition chunk) {
|
||||||
|
|
||||||
this.x = chunk.getXDirAdj() + chunk.getWidthDirAdj() / 2;
|
this.x = chunk.getBBoxDirAdj().getCenterX();
|
||||||
this.y = chunk.getYDirAdj() + chunk.getHeightDir() / 2;
|
this.y = chunk.getBBoxDirAdj().getCenterY();
|
||||||
this.textPosition = chunk;
|
this.textPosition = chunk;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,324 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
WIP, mostly working, needs to be tested a bit more
|
||||||
|
*/
|
||||||
|
public class ColumnDetector {
|
||||||
|
|
||||||
|
public static final double MAX_VALUE_THRESHOLD = 0.5;
|
||||||
|
final static int bins_num = 512;
|
||||||
|
final static int globalStartIdx = 0; // ignore outer parts completely, we don't expect columns there
|
||||||
|
final static int globalEndIdx = bins_num; // i chose 7, since thirds seems a likely split for columns, therefore divided by 6 would eliminate those.
|
||||||
|
public static final double DERIVATIVE_ZERO_THRESHOLD = 1e-10;
|
||||||
|
public static final double MINIMUM_THRESHOLD_FOR_COLUMNS = 0.05;
|
||||||
|
public static final double NEAR_GLOBAL_THRESHOLD = 0.5;
|
||||||
|
double minY;
|
||||||
|
double maxY;
|
||||||
|
double midY;
|
||||||
|
double[] histogram;
|
||||||
|
double min;
|
||||||
|
double max;
|
||||||
|
double resolution;
|
||||||
|
double sum;
|
||||||
|
int N;
|
||||||
|
|
||||||
|
|
||||||
|
public ColumnDetector(double min, double max, double minY, double maxY) {
|
||||||
|
|
||||||
|
this.min = min;
|
||||||
|
this.max = max;
|
||||||
|
this.minY = minY;
|
||||||
|
this.maxY = maxY;
|
||||||
|
this.midY = maxY - minY;
|
||||||
|
this.resolution = (max - min) / bins_num;
|
||||||
|
this.histogram = new double[bins_num];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void add(BoundingBox zone) {
|
||||||
|
|
||||||
|
N++;
|
||||||
|
double weight = computeWeight(zone);
|
||||||
|
int start = (int) ((zone.getMinX() - min) / resolution);
|
||||||
|
int end = (int) ((zone.getMaxX() - min) / resolution);
|
||||||
|
for (int i = start; i < end; i++) {
|
||||||
|
histogram[i] += weight;
|
||||||
|
sum += histogram[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double computeWeight(BoundingBox zone) {
|
||||||
|
|
||||||
|
double areaWeight = zone.getBBox().getHeight();
|
||||||
|
|
||||||
|
double relativeDistance = relativeDistanceToMiddle(zone.getBBox().getCenterY());
|
||||||
|
|
||||||
|
double distanceWeight;
|
||||||
|
if (relativeDistance < 0.6) {
|
||||||
|
distanceWeight = 1;
|
||||||
|
} else if (relativeDistance < 0.8) {
|
||||||
|
distanceWeight = 0.8;
|
||||||
|
} else {
|
||||||
|
distanceWeight = 0.1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return areaWeight * distanceWeight;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double relativeDistanceToMiddle(double y) {
|
||||||
|
|
||||||
|
double range = (maxY - minY) / 2;
|
||||||
|
double mid = minY + range;
|
||||||
|
|
||||||
|
return Math.abs(y - mid) / range;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double[] computeDerivative() {
|
||||||
|
|
||||||
|
int length = histogram.length;
|
||||||
|
double[] derivative = new double[length];
|
||||||
|
|
||||||
|
for (int i = 0; i < length; i++) {
|
||||||
|
if (i == 0) {
|
||||||
|
derivative[i] = (histogram[i + 1] - histogram[i]) / resolution;
|
||||||
|
} else if (i == length - 1) {
|
||||||
|
derivative[i] = (histogram[i] - histogram[i - 1]) / resolution;
|
||||||
|
} else {
|
||||||
|
derivative[i] = (histogram[i + 1] - histogram[i - 1]) / (2 * resolution);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return derivative;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double calcMean(double[] arr, int start, int end) {
|
||||||
|
|
||||||
|
if (start == end) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
double sum = 0;
|
||||||
|
for (int i = start; i < end; i++) {
|
||||||
|
sum += arr[i];
|
||||||
|
}
|
||||||
|
return sum / (end - start);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
Find columns, by finding all local maxima/minima of the derivative. Filtering them for the ones with the biggest values.
|
||||||
|
For each found minima, we will step to the right until we hit a 0 in the derivative, this indicates a minimum in the main histogram. If this minimum is below a threshold, it is deemed a column divider.
|
||||||
|
Same goes for maxima, but stepping to the left now, since minima in the function will always be to the left of a maximum in its derivative.
|
||||||
|
*/
|
||||||
|
public List<Double> determineColumnsWithDerivative(double[] derivative) {
|
||||||
|
|
||||||
|
assert derivative.length == histogram.length;
|
||||||
|
|
||||||
|
Set<Integer> columnIndices = new HashSet<>();
|
||||||
|
double mean = calcMean(histogram, 0, histogram.length);
|
||||||
|
double maxDvValue = calcMax(derivative);
|
||||||
|
double minDvValue = calcMin(derivative);
|
||||||
|
|
||||||
|
if (maxDvValue - minDvValue < mean * MAX_VALUE_THRESHOLD) {
|
||||||
|
Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
|
Extrema derivativeExtrema = calculateNearGlobalExtrema(derivative, maxDvValue, minDvValue);
|
||||||
|
|
||||||
|
List<Integer> columnsRightOfMinima = findZerosToTheRightOfMinima(derivative, derivativeExtrema.minima(), mean);
|
||||||
|
columnIndices.addAll(columnsRightOfMinima);
|
||||||
|
|
||||||
|
List<Integer> columnsLeftOfMaxima = findZerosToTheLeftOfMaxima(derivative, derivativeExtrema.maxima(), mean);
|
||||||
|
columnIndices.addAll(columnsLeftOfMaxima);
|
||||||
|
|
||||||
|
return columnIndices.stream()
|
||||||
|
.sorted(Comparator.naturalOrder())
|
||||||
|
.map(this::calculateXCoordinateFromIdx)
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Integer> findZerosToTheLeftOfMaxima(double[] derivative, List<Integer> derivativeMaxima, double mean) {
|
||||||
|
|
||||||
|
List<Integer> columnsLeftOfMaxima = new ArrayList<>();
|
||||||
|
|
||||||
|
for (int i = 0; i < derivativeMaxima.size(); i++) {
|
||||||
|
List<Integer> consecutiveZeroes = new LinkedList<>();
|
||||||
|
boolean maximumFound = false;
|
||||||
|
int maximaIdx = derivativeMaxima.get(i) - 1; // the highest derivative will always be at least one step away from the lowest value.
|
||||||
|
int endIdx = (int) Math.max(globalStartIdx,
|
||||||
|
Math.min(maximaIdx - 1,
|
||||||
|
maximaIdx - 0.1 * bins_num)); // search through 10% of array to the right, but at least one step and at most to the left edge;
|
||||||
|
|
||||||
|
for (int j = maximaIdx; j >= endIdx; j--) {
|
||||||
|
if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
|
||||||
|
maximumFound = true;
|
||||||
|
consecutiveZeroes.add(j);
|
||||||
|
} else if (maximumFound) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (maximumFound) {
|
||||||
|
int midIdx = consecutiveZeroes.size() / 2;
|
||||||
|
int middleMinimumIdx = consecutiveZeroes.get(midIdx);
|
||||||
|
if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) {
|
||||||
|
columnsLeftOfMaxima.add(middleMinimumIdx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return columnsLeftOfMaxima;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Integer> findZerosToTheRightOfMinima(double[] derivative, List<Integer> derivativeMinima, double mean) {
|
||||||
|
|
||||||
|
List<Integer> columnIndixes = new LinkedList<>();
|
||||||
|
for (int i = 0; i < derivativeMinima.size(); i++) {
|
||||||
|
List<Integer> consecutiveZeroes = new LinkedList<>();
|
||||||
|
boolean minimumFound = false;
|
||||||
|
int minimaIdx = derivativeMinima.get(i) + 1; // the highest derivative will always be at least one step earlier than the lowest value.
|
||||||
|
int endIdx = (int) Math.min(globalEndIdx,
|
||||||
|
Math.max(minimaIdx + 1,
|
||||||
|
minimaIdx + 0.1 * bins_num)); // search through 10% of array to the right, but at least one step and at most to the right edge;
|
||||||
|
|
||||||
|
for (int j = minimaIdx; j < endIdx; j++) {
|
||||||
|
if (derivative[j] < DERIVATIVE_ZERO_THRESHOLD) {
|
||||||
|
minimumFound = true;
|
||||||
|
consecutiveZeroes.add(j);
|
||||||
|
} else if (minimumFound) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (minimumFound) {
|
||||||
|
int midIdx = consecutiveZeroes.size() / 2;
|
||||||
|
int middleMinimumIdx = consecutiveZeroes.get(midIdx);
|
||||||
|
if (histogram[middleMinimumIdx] < mean * MINIMUM_THRESHOLD_FOR_COLUMNS) {
|
||||||
|
columnIndixes.add(middleMinimumIdx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return columnIndixes;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double calcMax(double[] array) {
|
||||||
|
|
||||||
|
double max = Double.NEGATIVE_INFINITY;
|
||||||
|
for (int i = 0; i < array.length; i++) {
|
||||||
|
if (array[i] > max) {
|
||||||
|
max = array[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return max;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double calcMin(double[] array) {
|
||||||
|
|
||||||
|
double min = Double.POSITIVE_INFINITY;
|
||||||
|
for (int i = 0; i < array.length; i++) {
|
||||||
|
if (array[i] < min) {
|
||||||
|
min = array[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return min;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Extrema calculateNearGlobalExtrema(double[] derivative, double maxDvValue, double minDvValue) {
|
||||||
|
|
||||||
|
List<Integer> nearGlobalDvMaximaIdx = new LinkedList<>();
|
||||||
|
List<Integer> nearGlobalDvMinimaIdx = new LinkedList<>();
|
||||||
|
for (int i = globalStartIdx; i < globalEndIdx; i++) {
|
||||||
|
if (derivative[i] <= minDvValue * NEAR_GLOBAL_THRESHOLD) {
|
||||||
|
nearGlobalDvMinimaIdx.add(i);
|
||||||
|
}
|
||||||
|
if (derivative[i] >= maxDvValue * NEAR_GLOBAL_THRESHOLD) {
|
||||||
|
nearGlobalDvMaximaIdx.add(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
nearGlobalDvMinimaIdx = removeConsecutive(nearGlobalDvMinimaIdx);
|
||||||
|
nearGlobalDvMaximaIdx = removeConsecutive(nearGlobalDvMaximaIdx);
|
||||||
|
|
||||||
|
return new Extrema(nearGlobalDvMaximaIdx, nearGlobalDvMinimaIdx);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private record Extrema(List<Integer> maxima, List<Integer> minima) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Double calculateXCoordinateFromIdx(int globalMinIdx) {
|
||||||
|
|
||||||
|
return min + ((globalMinIdx + 1) * resolution);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static List<Integer> removeConsecutive(List<Integer> numbers) {
|
||||||
|
|
||||||
|
List<Integer> result = new ArrayList<>();
|
||||||
|
if (numbers == null || numbers.isEmpty()) {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
result.add(numbers.get(0)); // Add the first number
|
||||||
|
|
||||||
|
for (int i = 1; i < numbers.size(); i++) {
|
||||||
|
if (numbers.get(i) != numbers.get(i - 1) + 1) {
|
||||||
|
result.add(numbers.get(i)); // Add non-consecutive numbers
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void kernelSmooth(double[] kernel) {
|
||||||
|
|
||||||
|
double[] newFrequencies = new double[histogram.length];
|
||||||
|
int shift = (kernel.length - 1) / 2;
|
||||||
|
for (int i = 0; i < kernel.length; i++) {
|
||||||
|
int jStart = Math.max(0, i - shift);
|
||||||
|
int jEnd = Math.min(histogram.length, histogram.length + i - shift);
|
||||||
|
for (int j = jStart; j < jEnd; j++) {
|
||||||
|
newFrequencies[j - i + shift] += kernel[i] * histogram[j];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
histogram = newFrequencies;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double[] createGaussianKernel(int length, double stdDeviation) {
|
||||||
|
|
||||||
|
int r = length / 2;
|
||||||
|
|
||||||
|
int size = 2 * r + 1;
|
||||||
|
double[] kernel = new double[size];
|
||||||
|
double sum = 0;
|
||||||
|
double b = 2 * (stdDeviation) * (stdDeviation);
|
||||||
|
double a = 1 / Math.sqrt(Math.PI * b);
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
kernel[i] = a * Math.exp(-(i - r) * (i - r) / b);
|
||||||
|
sum += kernel[i];
|
||||||
|
}
|
||||||
|
for (int i = 0; i < size; i++) {
|
||||||
|
kernel[i] /= sum;
|
||||||
|
}
|
||||||
|
return kernel;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,10 +1,10 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.FastAtan2;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
@ -72,7 +72,7 @@ public class Line extends BoundingBox {
|
|||||||
|
|
||||||
public double getAngle() {
|
public double getAngle() {
|
||||||
|
|
||||||
return Math.atan2(y1 - y0, x1 - x0);
|
return FastAtan2.fastAtan2(y1 - y0, x1 - x0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -84,7 +84,9 @@ public class Line extends BoundingBox {
|
|||||||
|
|
||||||
private double computeHeight() {
|
private double computeHeight() {
|
||||||
|
|
||||||
return characters.stream().map(Character::getHeight).reduce(0d, Double::sum) / characters.size();
|
return characters.stream()
|
||||||
|
.map(Character::getHeight)
|
||||||
|
.reduce(0d, Double::sum) / characters.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -116,7 +118,7 @@ public class Line extends BoundingBox {
|
|||||||
|
|
||||||
double ym = (y0 + y1) / 2;
|
double ym = (y0 + y1) / 2;
|
||||||
double yn = (other.y0 + other.y1) / 2;
|
double yn = (other.y0 + other.y1) / 2;
|
||||||
return Math.abs(ym - yn) / Math.sqrt(1);
|
return Math.abs(ym - yn);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -141,21 +143,9 @@ public class Line extends BoundingBox {
|
|||||||
|
|
||||||
private void buildBBox() {
|
private void buildBBox() {
|
||||||
|
|
||||||
double minX = Double.POSITIVE_INFINITY;
|
this.setToBBoxOfComponents(characters.stream()
|
||||||
double minY = Double.POSITIVE_INFINITY;
|
.map(Character::getTextPosition)
|
||||||
double maxX = Double.NEGATIVE_INFINITY;
|
.toList());
|
||||||
double maxY = Double.NEGATIVE_INFINITY;
|
|
||||||
|
|
||||||
for (Character character : characters) {
|
|
||||||
|
|
||||||
minX = Math.min(minX, character.getTextPosition().getXDirAdj());
|
|
||||||
minY = Math.min(minY, character.getTextPosition().getYDirAdj());
|
|
||||||
maxX = Math.max(maxX, character.getTextPosition().getXDirAdj() + character.getTextPosition().getWidthDirAdj());
|
|
||||||
maxY = Math.max(maxY, character.getTextPosition().getYDirAdj() + character.getTextPosition().getHeightDir());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,9 +1,10 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.model;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@ -15,29 +16,9 @@ public class Zone extends BoundingBox {
|
|||||||
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
|
@SuppressWarnings("PMD.ConstructorCallsOverridableMethod")
|
||||||
public Zone(List<Line> lines) {
|
public Zone(List<Line> lines) {
|
||||||
|
|
||||||
lines.sort(Comparator.comparingDouble(Line::getY));
|
lines.sort(Comparator.comparingDouble(Line::getY0));
|
||||||
this.lines = lines;
|
this.lines = lines;
|
||||||
buildBBox();
|
setToBBoxOfComponents(lines);
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void buildBBox() {
|
|
||||||
|
|
||||||
double minX = Double.POSITIVE_INFINITY;
|
|
||||||
double minY = Double.POSITIVE_INFINITY;
|
|
||||||
double maxX = Double.NEGATIVE_INFINITY;
|
|
||||||
double maxY = Double.NEGATIVE_INFINITY;
|
|
||||||
|
|
||||||
for (Line line : lines) {
|
|
||||||
|
|
||||||
minX = Math.min(minX, line.getX());
|
|
||||||
minY = Math.min(minY, line.getY());
|
|
||||||
maxX = Math.max(maxX, line.getX() + line.getWidth());
|
|
||||||
maxY = Math.max(maxY, line.getY() + line.getHeight());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
this.setBBox(new Rectangle2D.Double(minX, minY, maxX - minX, maxY - minY));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
package com.knecon.fforesight.service.layoutparser.processor.docstrum.service;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -11,43 +10,49 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Angle
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Character;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
public class LineBuilderService {
|
public class LineBuilderService {
|
||||||
|
|
||||||
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
|
private static final double CHARACTER_SPACING_DISTANCE_MULTIPLIER = 3.5;
|
||||||
private static final double MAX_VERTICAL_CHARACTER_DISTANCE = 0.67;
|
private static final double LINE_SPACING_THRESHOLD_MULTIPLIER = 0.67;
|
||||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||||
|
|
||||||
|
|
||||||
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing) {
|
public List<Line> buildLines(List<Character> characters, double characterSpacing, double lineSpacing, CleanRulings rulings) {
|
||||||
|
|
||||||
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
|
double maxHorizontalDistance = characterSpacing * CHARACTER_SPACING_DISTANCE_MULTIPLIER;
|
||||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_CHARACTER_DISTANCE;
|
double maxVerticalDistance = lineSpacing * LINE_SPACING_THRESHOLD_MULTIPLIER;
|
||||||
|
|
||||||
UnionFind<Character> unionFind = new UnionFind<>(new HashSet<>(characters));
|
UnionFind<Character> unionFind = new UnionFind<>(new HashSet<>(characters));
|
||||||
|
|
||||||
AngleFilter filter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
AngleFilter angleFilter = new AngleFilter(-ANGLE_TOLERANCE, ANGLE_TOLERANCE);
|
||||||
|
|
||||||
characters.forEach(character -> {
|
characters.forEach(character -> {
|
||||||
character.getNeighbors().forEach(neighbor -> {
|
character.getNeighbors()
|
||||||
double x = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
.forEach(neighbor -> {
|
||||||
double y = neighbor.getVerticalDistance() / maxVerticalDistance;
|
double normalizedHorizontalDistance = neighbor.getHorizontalDistance() / maxHorizontalDistance;
|
||||||
if (character.getTextPosition().getDir() == neighbor.getCharacter().getTextPosition().getDir() && filter.matches(neighbor) && Math.pow(x, 2) + Math.pow(y,
|
double normalizedVerticalDistance = neighbor.getVerticalDistance() / maxVerticalDistance;
|
||||||
2) <= 1) {
|
|
||||||
unionFind.union(character, neighbor.getCharacter());
|
if (character.getTextPosition().getDir() != neighbor.getCharacter().getTextPosition().getDir() //
|
||||||
}
|
|| !angleFilter.matches(neighbor) //
|
||||||
});
|
|| Math.pow(normalizedHorizontalDistance, 2) + Math.pow(normalizedVerticalDistance, 2) > 1 //
|
||||||
|
|| rulings.lineBetween(character.getTextPosition(), neighbor.getCharacter().getTextPosition())) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
unionFind.union(character, neighbor.getCharacter());
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
List<Line> lines = new ArrayList<>();
|
return unionFind.getGroups()
|
||||||
unionFind.getGroups().forEach(group -> {
|
.stream()
|
||||||
List<Character> lineCharacters = new ArrayList<>(group);
|
.map(lineCharacters -> lineCharacters.stream()
|
||||||
lineCharacters.sort(Comparator.comparingDouble(Character::getX));
|
.sorted(Comparator.comparingDouble(Character::getX))
|
||||||
lines.add(new Line(lineCharacters, characterSpacing));
|
.toList())
|
||||||
});
|
.map(lineCharacters -> new Line(lineCharacters, characterSpacing))
|
||||||
|
.toList();
|
||||||
return lines;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -39,7 +39,10 @@ public class ReadingOrderService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (histogram.values().stream().mapToInt(Integer::intValue).average().orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
if (histogram.values()
|
||||||
|
.stream()
|
||||||
|
.mapToInt(Integer::intValue).average()
|
||||||
|
.orElse(1) < MULTI_COLUMN_DETECTION_THRESHOLD) {
|
||||||
return resolveSingleColumnReadingOrder(zones);
|
return resolveSingleColumnReadingOrder(zones);
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
@ -52,7 +55,7 @@ public class ReadingOrderService {
|
|||||||
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
|
private static List<Zone> resolveSingleColumnReadingOrder(List<Zone> zones) {
|
||||||
|
|
||||||
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
zones.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||||
return zones;
|
return zones;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -90,14 +93,14 @@ public class ReadingOrderService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
leftOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||||
|
|
||||||
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
rightOf.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||||
|
|
||||||
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
middle.sort(Comparator.comparing(BoundingBox::getY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||||
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
.thenComparing(BoundingBox::getX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||||
|
/*
|
||||||
List<Zone> leftNotIntersecting = new ArrayList<>();
|
List<Zone> leftNotIntersecting = new ArrayList<>();
|
||||||
for (Zone leftZone : leftOf) {
|
for (Zone leftZone : leftOf) {
|
||||||
boolean intersects = false;
|
boolean intersects = false;
|
||||||
@ -139,7 +142,7 @@ public class ReadingOrderService {
|
|||||||
|
|
||||||
middle.addAll(leftNotIntersecting);
|
middle.addAll(leftNotIntersecting);
|
||||||
middle.addAll(rightNotIntersecting);
|
middle.addAll(rightNotIntersecting);
|
||||||
|
*/
|
||||||
List<Zone> sortedZones = new ArrayList<>();
|
List<Zone> sortedZones = new ArrayList<>();
|
||||||
sortedZones.addAll(leftOf);
|
sortedZones.addAll(leftOf);
|
||||||
sortedZones.addAll(rightOf);
|
sortedZones.addAll(rightOf);
|
||||||
|
|||||||
@ -5,6 +5,7 @@ import java.util.Comparator;
|
|||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
@ -12,6 +13,7 @@ import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Chara
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.UnionFind;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
public class ZoneBuilderService {
|
public class ZoneBuilderService {
|
||||||
@ -29,12 +31,10 @@ public class ZoneBuilderService {
|
|||||||
|
|
||||||
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
private static final double ANGLE_TOLERANCE = Math.PI / 6;
|
||||||
|
|
||||||
private static final int MAX_ZONES = 300;
|
|
||||||
|
|
||||||
private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
|
private static final double MAX_VERTICAL_MERGE_DISTANCE = 0.5;
|
||||||
|
|
||||||
|
|
||||||
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing) {
|
public List<Zone> buildZones(List<Line> lines, double characterSpacing, double lineSpacing, CleanRulings rulings) {
|
||||||
|
|
||||||
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
|
double minHorizontalDistance = characterSpacing * MIN_HORIZONTAL_DISTANCE_MULTIPLIER;
|
||||||
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
|
double maxVerticalDistance = lineSpacing * MAX_VERTICAL_DISTANCE_MULTIPLIER;
|
||||||
@ -45,38 +45,39 @@ public class ZoneBuilderService {
|
|||||||
|
|
||||||
double meanHeight = calculateMeanHeight(lines);
|
double meanHeight = calculateMeanHeight(lines);
|
||||||
|
|
||||||
lines.forEach(outerLine -> //
|
lines.forEach(outerLine -> {
|
||||||
lines.forEach(innerLine -> {
|
lines.forEach(innerLine -> {
|
||||||
|
|
||||||
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
|
if (innerLine == outerLine //
|
||||||
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
|
|| unionFind.inSameSet(outerLine, innerLine)//
|
||||||
|
|| outerLine.angularDifference(innerLine) > ANGLE_TOLERANCE) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (!unionFind.inSameSet(outerLine, innerLine) && outerLine.angularDifference(innerLine) <= ANGLE_TOLERANCE) {
|
double scale = Math.min(outerLine.getHeight(), innerLine.getHeight()) / meanHeight;
|
||||||
|
scale = Math.max(MIN_LINE_SIZE_SCALE, Math.min(scale, MAX_LINE_SIZE_SCALE));
|
||||||
|
|
||||||
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
|
double horizontalDistance = outerLine.horizontalDistance(innerLine) / scale;
|
||||||
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
|
double verticalDistance = outerLine.verticalDistance(innerLine) / scale;
|
||||||
|
|
||||||
if (minHorizontalDistance <= horizontalDistance && verticalDistance <= maxVerticalDistance //
|
if ((!(minHorizontalDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalDistance)) //
|
||||||
|| minHorizontalMergeDistance <= horizontalDistance && verticalDistance <= maxVerticalMergeDistance) {
|
&& (!(minHorizontalMergeDistance <= horizontalDistance) || !(verticalDistance <= maxVerticalMergeDistance))) {
|
||||||
unionFind.union(outerLine, innerLine);
|
return;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}));
|
|
||||||
|
|
||||||
List<Zone> zones = new ArrayList<>();
|
if (rulings.lineBetween(outerLine, innerLine)) {
|
||||||
unionFind.getGroups().forEach(group -> {
|
return;
|
||||||
zones.add(mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing));
|
}
|
||||||
|
|
||||||
|
unionFind.union(outerLine, innerLine);
|
||||||
|
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
if (zones.size() > MAX_ZONES) {
|
return unionFind.getGroups()
|
||||||
List<Line> oneZoneLines = new ArrayList<>();
|
.stream()
|
||||||
for (Zone zone : zones) {
|
.map(group -> mergeLinesInZone(new ArrayList<>(group), characterSpacing, lineSpacing))
|
||||||
oneZoneLines.addAll(zone.getLines());
|
.toList();
|
||||||
}
|
|
||||||
return List.of(mergeLinesInZone(oneZoneLines, characterSpacing, lineSpacing));
|
|
||||||
}
|
|
||||||
|
|
||||||
return zones;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -103,35 +104,40 @@ public class ZoneBuilderService {
|
|||||||
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
|
UnionFind<Line> unionFind = new UnionFind<>(new HashSet<>(lines));
|
||||||
|
|
||||||
lines.forEach(outer -> {
|
lines.forEach(outer -> {
|
||||||
|
|
||||||
lines.forEach(inner -> {
|
lines.forEach(inner -> {
|
||||||
if (inner != outer) {
|
if (inner == outer) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
double horizontalDistance = outer.horizontalDistance(inner);
|
double horizontalDistance = outer.horizontalDistance(inner);
|
||||||
double verticalDistance = outer.verticalDistance(inner);
|
double verticalDistance = outer.verticalDistance(inner);
|
||||||
|
|
||||||
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
|
if (horizontalDistance <= maxHorizontalDistance && minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance) {
|
||||||
unionFind.union(outer, inner);
|
|
||||||
} else if (minVerticalDistance <= verticalDistance && verticalDistance <= maxVerticalDistance && Math.abs(horizontalDistance - Math.min(outer.getLength(),
|
unionFind.union(outer, inner);
|
||||||
inner.getLength())) < 0.1) {
|
|
||||||
boolean characterOverlap = false;
|
} else if (minVerticalDistance <= verticalDistance
|
||||||
int overlappingCount = 0;
|
&& verticalDistance <= maxVerticalDistance
|
||||||
for (Character outerCharacter : outer.getCharacters()) {
|
&& Math.abs(horizontalDistance - Math.min(outer.getLength(), inner.getLength())) < 0.1) {
|
||||||
for (Character innerCharacter : inner.getCharacters()) {
|
|
||||||
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
|
boolean characterOverlap = false;
|
||||||
if (characterOverlapDistance > 2) {
|
int overlappingCount = 0;
|
||||||
characterOverlap = true;
|
for (Character outerCharacter : outer.getCharacters()) {
|
||||||
}
|
for (Character innerCharacter : inner.getCharacters()) {
|
||||||
if (characterOverlapDistance > 0) {
|
double characterOverlapDistance = outerCharacter.overlappingDistance(innerCharacter);
|
||||||
overlappingCount++;
|
if (characterOverlapDistance > 2) {
|
||||||
}
|
characterOverlap = true;
|
||||||
|
}
|
||||||
|
if (characterOverlapDistance > 0) {
|
||||||
|
overlappingCount++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!characterOverlap && overlappingCount <= 2) {
|
}
|
||||||
unionFind.union(outer, inner);
|
if (!characterOverlap && overlappingCount <= 2) {
|
||||||
}
|
unionFind.union(outer, inner);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -146,7 +152,9 @@ public class ZoneBuilderService {
|
|||||||
outputZone.add(new Line(characters, characterSpacing));
|
outputZone.add(new Line(characters, characterSpacing));
|
||||||
}
|
}
|
||||||
|
|
||||||
return new Zone(outputZone);
|
return new Zone(outputZone.stream()
|
||||||
|
.sorted(Comparator.comparing(Line::getY0))
|
||||||
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,7 +1,10 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model;
|
package com.knecon.fforesight.service.layoutparser.processor.model;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
@ -13,16 +16,8 @@ import lombok.NoArgsConstructor;
|
|||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
@NoArgsConstructor
|
@NoArgsConstructor
|
||||||
@EqualsAndHashCode(callSuper = true)
|
@EqualsAndHashCode(callSuper = true)
|
||||||
public abstract class AbstractPageBlock extends Rectangle {
|
public abstract class AbstractPageBlock extends BoundingBox {
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
protected float minX;
|
|
||||||
@JsonIgnore
|
|
||||||
protected float maxX;
|
|
||||||
@JsonIgnore
|
|
||||||
protected float minY;
|
|
||||||
@JsonIgnore
|
|
||||||
protected float maxY;
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
protected PageBlockType classification;
|
protected PageBlockType classification;
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
@ -41,63 +36,6 @@ public abstract class AbstractPageBlock extends Rectangle {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean containsBlock(TextPageBlock other) {
|
|
||||||
|
|
||||||
return this.minX <= other.getPdfMinX() && this.maxX >= other.getPdfMaxX() && this.minY >= other.getPdfMinY() && this.maxY <= other.getPdfMaxY();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean contains(AbstractPageBlock other) {
|
|
||||||
|
|
||||||
return this.minX <= other.minX && this.maxX >= other.maxX && this.minY >= other.minY && this.maxY <= other.maxY;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean contains(Rectangle other) {
|
|
||||||
|
|
||||||
return page == other.getPage() && this.minX <= other.getTopLeft().getX() && this.maxX >= other.getTopLeft().getX() + other.getWidth() && this.minY <= other.getTopLeft()
|
|
||||||
.getY() && this.maxY >= other.getTopLeft().getY() + other.getHeight();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
public float getHeight() {
|
|
||||||
|
|
||||||
return maxY - minY;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
public float getWidth() {
|
|
||||||
|
|
||||||
return maxX - minX;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean intersectsY(AbstractPageBlock apb) {
|
|
||||||
|
|
||||||
return this.minY <= apb.getMaxY() && this.maxY >= apb.getMinY();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean almostIntersects(AbstractPageBlock apb, float yThreshold, float xThreshold) {
|
|
||||||
|
|
||||||
return this.almostIntersectsX(apb, xThreshold) && this.almostIntersectsY(apb, yThreshold);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean almostIntersectsY(AbstractPageBlock apb, float threshold) {
|
|
||||||
|
|
||||||
return this.minY - threshold <= apb.getMaxY() && this.maxY + threshold >= apb.getMinY();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean almostIntersectsX(AbstractPageBlock apb, float threshold) {
|
|
||||||
|
|
||||||
return this.minX - threshold <= apb.getMaxX() && this.maxX + threshold >= apb.getMinX();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public abstract boolean isEmpty();
|
public abstract boolean isEmpty();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -7,6 +7,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.outline.Outlin
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.NoArgsConstructor;
|
import lombok.NoArgsConstructor;
|
||||||
@ -24,6 +25,7 @@ public class ClassificationDocument {
|
|||||||
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
|
private FloatFrequencyCounter fontSizeCounter = new FloatFrequencyCounter();
|
||||||
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
|
private StringFrequencyCounter fontCounter = new StringFrequencyCounter();
|
||||||
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
|
private StringFrequencyCounter fontStyleCounter = new StringFrequencyCounter();
|
||||||
|
private LayoutparsingVisualizations visualizations = new LayoutparsingVisualizations();
|
||||||
private boolean headlines;
|
private boolean headlines;
|
||||||
|
|
||||||
private long rulesVersion;
|
private long rulesVersion;
|
||||||
|
|||||||
@ -12,6 +12,7 @@ import java.util.stream.Stream;
|
|||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
@ -40,6 +41,8 @@ public class Document implements GenericSemanticNode {
|
|||||||
@Builder.Default
|
@Builder.Default
|
||||||
Set<RedactionEntity> entities = new HashSet<>();
|
Set<RedactionEntity> entities = new HashSet<>();
|
||||||
|
|
||||||
|
LayoutparsingVisualizations visualizations;
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public NodeType getType() {
|
public NodeType getType() {
|
||||||
|
|||||||
@ -1,11 +1,13 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||||
|
|
||||||
|
import java.awt.geom.AffineTransform;
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||||
@ -18,7 +20,7 @@ import lombok.NoArgsConstructor;
|
|||||||
@Data
|
@Data
|
||||||
@EqualsAndHashCode(callSuper = true)
|
@EqualsAndHashCode(callSuper = true)
|
||||||
@NoArgsConstructor
|
@NoArgsConstructor
|
||||||
public class Cell extends Rectangle {
|
public class Cell extends BoundingBox {
|
||||||
|
|
||||||
private List<TextPageBlock> textBlocks = new ArrayList<>();
|
private List<TextPageBlock> textBlocks = new ArrayList<>();
|
||||||
|
|
||||||
@ -33,13 +35,24 @@ public class Cell extends Rectangle {
|
|||||||
|
|
||||||
public Cell(Point2D topLeft, Point2D bottomRight) {
|
public Cell(Point2D topLeft, Point2D bottomRight) {
|
||||||
|
|
||||||
super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
|
this.bBoxInitialUserSpace = new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), (bottomRight.getX() - topLeft.getX()), (bottomRight.getY() - topLeft.getY()));
|
||||||
|
this.bBox = bBoxInitialUserSpace;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public Cell(Rectangle2D r) {
|
public Cell(Rectangle2D bBoxInitialUserSpace, AffineTransform initialUserSpaceToJava) {
|
||||||
|
|
||||||
super((float) r.getY(), (float) r.getX(), (float) r.getWidth(), (float) r.getHeight());
|
this.bBoxInitialUserSpace = bBoxInitialUserSpace;
|
||||||
|
this.bBox = initialUserSpaceToJava.createTransformedShape(bBoxInitialUserSpace).getBounds2D();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static Cell copy(Cell cell) {
|
||||||
|
|
||||||
|
Cell copy = new Cell();
|
||||||
|
copy.bBoxInitialUserSpace = cell.bBoxInitialUserSpace;
|
||||||
|
copy.bBox = cell.bBox;
|
||||||
|
return copy;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,15 +1,206 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
||||||
|
|
||||||
|
import java.awt.geom.Line2D;
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import lombok.Builder;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||||
import lombok.Data;
|
|
||||||
|
|
||||||
@Data
|
import lombok.Getter;
|
||||||
@Builder
|
|
||||||
|
@Getter
|
||||||
public class CleanRulings {
|
public class CleanRulings {
|
||||||
|
|
||||||
List<Ruling> horizontal;
|
List<Ruling> horizontals; // unmodifiable sorted by Y list
|
||||||
List<Ruling> vertical;
|
List<Ruling> verticals; // unmodifiable sorted by X list
|
||||||
|
|
||||||
|
|
||||||
|
public CleanRulings(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||||
|
|
||||||
|
this.horizontals = horizontals.stream()
|
||||||
|
.peek(Ruling::assertHorizontal)
|
||||||
|
.sorted(Comparator.comparing(Line2D.Float::getY1))
|
||||||
|
.toList();
|
||||||
|
this.verticals = verticals.stream()
|
||||||
|
.peek(Ruling::assertVertical)
|
||||||
|
.sorted(Comparator.comparing(Line2D.Float::getX1))
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public CleanRulings getTableLines() {
|
||||||
|
|
||||||
|
return new CleanRulings(horizontals.stream()
|
||||||
|
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.TABLE_LINE))
|
||||||
|
.toList(),
|
||||||
|
verticals.stream()
|
||||||
|
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.TABLE_LINE))
|
||||||
|
.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public CleanRulings withoutTextRulings() {
|
||||||
|
|
||||||
|
return new CleanRulings(horizontals.stream()
|
||||||
|
.filter(ruling -> !(ruling.getClassification().equals(Ruling.Classification.UNDERLINE) || ruling.getClassification()
|
||||||
|
.equals(Ruling.Classification.STRIKETROUGH)))
|
||||||
|
.toList(),
|
||||||
|
verticals.stream()
|
||||||
|
.filter(ruling -> !(ruling.getClassification().equals(Ruling.Classification.UNDERLINE) || ruling.getClassification()
|
||||||
|
.equals(Ruling.Classification.STRIKETROUGH)))
|
||||||
|
.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<Ruling> buildAll() {
|
||||||
|
|
||||||
|
ArrayList<Ruling> rulings = new ArrayList<>(horizontals.size() + verticals.size());
|
||||||
|
rulings.addAll(horizontals);
|
||||||
|
rulings.addAll(verticals);
|
||||||
|
return rulings;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean lineBetween(BoundingBox a, BoundingBox b) {
|
||||||
|
|
||||||
|
return lineBetween(a.getBBoxInitialUserSpace(), b.getBBoxInitialUserSpace());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean lineBetween(Rectangle2D a, Rectangle2D b) {
|
||||||
|
|
||||||
|
return lineBetween(new Point2D.Double(a.getCenterX(), a.getCenterY()), new Point2D.Double(b.getCenterX(), b.getCenterY()));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean lineBetween(Point2D p1, Point2D p2) {
|
||||||
|
|
||||||
|
Ruling ruling = new Ruling(p1, p2);
|
||||||
|
|
||||||
|
if (ruling.isHorizontal()) {
|
||||||
|
return getVerticalsInXInterval(ruling.x1, ruling.x2).stream()
|
||||||
|
.anyMatch(vertical -> vertical.intersectsLine(ruling));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ruling.isVertical()) {
|
||||||
|
return getHorizontalsInYInterval(ruling.y1, ruling.y2).stream()
|
||||||
|
.anyMatch(horizontal -> horizontal.intersectsLine(ruling));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return Stream.of(getVerticalsInXInterval(ruling.x1, ruling.x2), getHorizontalsInYInterval(ruling.y1, ruling.y2))
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.anyMatch(other -> other.intersectsLine(ruling));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<Ruling> getHorizontalsInYInterval(float y1, float y2) {
|
||||||
|
|
||||||
|
float startY = Math.min(y1, y2);
|
||||||
|
float endY = Math.max(y1, y2);
|
||||||
|
|
||||||
|
if (horizontals.isEmpty() || Float.isNaN(startY) || Float.isNaN(endY)) {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
|
int firstGreaterThanIdx = findFirstHorizontalRulingIdxAbove(startY);
|
||||||
|
|
||||||
|
if (firstGreaterThanIdx == -1) {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Ruling> result = new LinkedList<>();
|
||||||
|
for (int i = firstGreaterThanIdx; i < horizontals.size(); i++) {
|
||||||
|
Ruling horizontal = horizontals.get(i);
|
||||||
|
if (horizontal.y1 > endY) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
result.add(horizontal);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private int findFirstHorizontalRulingIdxAbove(float y) {
|
||||||
|
|
||||||
|
int low = 0;
|
||||||
|
int high = horizontals.size() - 1;
|
||||||
|
|
||||||
|
while (low <= high) {
|
||||||
|
int mid = low + (high - low) / 2;
|
||||||
|
Line2D.Float midLine = horizontals.get(mid);
|
||||||
|
float midY = midLine.y1;
|
||||||
|
|
||||||
|
if (midY == y) {
|
||||||
|
return mid;
|
||||||
|
} else if (midY > y) {
|
||||||
|
high = mid - 1;
|
||||||
|
} else {
|
||||||
|
low = mid + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// No exact match for y was found: return the index of the first element greater than y, or -1 if there is none
|
||||||
|
return horizontals.size() > low && horizontals.get(low).y1 > y ? low : -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<Ruling> getVerticalsInXInterval(float x1, float x2) {
|
||||||
|
|
||||||
|
float startX = Math.min(x1, x2);
|
||||||
|
float endX = Math.max(x1, x2);
|
||||||
|
|
||||||
|
if (verticals.isEmpty() || Float.isNaN(startX) || Float.isNaN(endX)) {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
|
int firstGreaterThanIdx = findFirstVerticalRulingIdxRightOf(startX);
|
||||||
|
|
||||||
|
if (firstGreaterThanIdx == -1) {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Ruling> result = new LinkedList<>();
|
||||||
|
for (int i = firstGreaterThanIdx; i < verticals.size(); i++) {
|
||||||
|
Ruling horizontal = verticals.get(i);
|
||||||
|
if (horizontal.x1 > endX) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
result.add(horizontal);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private int findFirstVerticalRulingIdxRightOf(float x) {
|
||||||
|
|
||||||
|
int low = 0;
|
||||||
|
int high = verticals.size() - 1;
|
||||||
|
|
||||||
|
while (low <= high) {
|
||||||
|
int mid = low + (high - low) / 2;
|
||||||
|
Line2D.Float midLine = verticals.get(mid);
|
||||||
|
float midX = midLine.x1;
|
||||||
|
|
||||||
|
if (midX == x) {
|
||||||
|
return mid;
|
||||||
|
} else if (midX > x) {
|
||||||
|
high = mid - 1;
|
||||||
|
} else {
|
||||||
|
low = mid + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// No exact match for x was found: return the index of the first element greater than x, or -1 if there is none
|
||||||
|
return verticals.size() > low && verticals.get(low).x1 > x ? low : -1;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,218 +0,0 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.table;
|
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
|
||||||
import java.awt.geom.Rectangle2D;
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
@SuppressWarnings("all")
|
|
||||||
public class Rectangle extends Rectangle2D.Float {
|
|
||||||
|
|
||||||
protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
|
|
||||||
/**
|
|
||||||
* Ill-defined comparator, from when Rectangle was Comparable.
|
|
||||||
* <p>
|
|
||||||
* see https://github.com/tabulapdf/tabula-java/issues/116
|
|
||||||
*
|
|
||||||
* @deprecated with no replacement
|
|
||||||
*/
|
|
||||||
@Deprecated
|
|
||||||
public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
|
|
||||||
@Override
|
|
||||||
public int compare(Rectangle o1, Rectangle o2) {
|
|
||||||
|
|
||||||
if (o1.equals(o2)) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
|
|
||||||
return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1 ? -java.lang.Double.compare(o1.getX(), o2.getX()) : java.lang.Double.compare(o1.getX(), o2.getX());
|
|
||||||
} else {
|
|
||||||
return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
public Rectangle() {
|
|
||||||
|
|
||||||
super();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Rectangle(float top, float left, float width, float height) {
|
|
||||||
|
|
||||||
super();
|
|
||||||
this.setRect(left, top, width, height);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param rectangles
|
|
||||||
* @return minimum bounding box that contains all the rectangles
|
|
||||||
*/
|
|
||||||
public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
|
|
||||||
|
|
||||||
float minx = java.lang.Float.MAX_VALUE;
|
|
||||||
float miny = java.lang.Float.MAX_VALUE;
|
|
||||||
float maxx = java.lang.Float.MIN_VALUE;
|
|
||||||
float maxy = java.lang.Float.MIN_VALUE;
|
|
||||||
|
|
||||||
for (Rectangle r : rectangles) {
|
|
||||||
minx = (float) Math.min(r.getMinX(), minx);
|
|
||||||
miny = (float) Math.min(r.getMinY(), miny);
|
|
||||||
maxx = (float) Math.max(r.getMaxX(), maxx);
|
|
||||||
maxy = (float) Math.max(r.getMaxY(), maxy);
|
|
||||||
}
|
|
||||||
return new Rectangle(miny, minx, maxx - minx, maxy - miny);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public int compareTo(Rectangle other) {
|
|
||||||
|
|
||||||
return ILL_DEFINED_ORDER.compare(this, other);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// I'm bad at Java and need this for fancy sorting in
|
|
||||||
// technology.tabula.TextChunk.
|
|
||||||
public int isLtrDominant() {
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public float getArea() {
|
|
||||||
|
|
||||||
return this.width * this.height;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public float verticalOverlap(Rectangle other) {
|
|
||||||
|
|
||||||
return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean verticallyOverlaps(Rectangle other) {
|
|
||||||
|
|
||||||
return verticalOverlap(other) > 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public float horizontalOverlap(Rectangle other) {
|
|
||||||
|
|
||||||
return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean horizontallyOverlaps(Rectangle other) {
|
|
||||||
|
|
||||||
return horizontalOverlap(other) > 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public float verticalOverlapRatio(Rectangle other) {
|
|
||||||
|
|
||||||
float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
|
|
||||||
|
|
||||||
if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
|
|
||||||
rv = (other.getBottom() - this.getTop()) / delta;
|
|
||||||
} else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
|
|
||||||
rv = (this.getBottom() - other.getTop()) / delta;
|
|
||||||
} else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
|
|
||||||
rv = (other.getBottom() - other.getTop()) / delta;
|
|
||||||
} else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
|
|
||||||
rv = (this.getBottom() - this.getTop()) / delta;
|
|
||||||
}
|
|
||||||
|
|
||||||
return rv;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public float overlapRatio(Rectangle other) {
|
|
||||||
|
|
||||||
double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
|
|
||||||
double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
|
|
||||||
double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
|
|
||||||
double unionArea = this.getArea() + other.getArea() - intersectionArea;
|
|
||||||
|
|
||||||
return (float) (intersectionArea / unionArea);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Rectangle merge(Rectangle other) {
|
|
||||||
|
|
||||||
this.setRect(this.createUnion(other));
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public float getTop() {
|
|
||||||
|
|
||||||
return (float) this.getMinY();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void setTop(float top) {
|
|
||||||
|
|
||||||
float deltaHeight = top - this.y;
|
|
||||||
this.setRect(this.x, top, this.width, this.height - deltaHeight);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public float getRight() {
|
|
||||||
|
|
||||||
return (float) this.getMaxX();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void setRight(float right) {
|
|
||||||
|
|
||||||
this.setRect(this.x, this.y, right - this.x, this.height);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public float getLeft() {
|
|
||||||
|
|
||||||
return (float) this.getMinX();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void setLeft(float left) {
|
|
||||||
|
|
||||||
float deltaWidth = left - this.x;
|
|
||||||
this.setRect(left, this.y, this.width - deltaWidth, this.height);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public float getBottom() {
|
|
||||||
|
|
||||||
return (float) this.getMaxY();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void setBottom(float bottom) {
|
|
||||||
|
|
||||||
this.setRect(this.x, this.y, this.width, bottom - this.y);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Point2D[] getPoints() {
|
|
||||||
|
|
||||||
return new Point2D[]{new Point2D.Float(this.getLeft(), this.getTop()), new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(),
|
|
||||||
this.getBottom()), new Point2D.Float(this.getLeft(), this.getBottom())};
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
|
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
String s = super.toString();
|
|
||||||
sb.append(s.substring(0, s.length() - 1));
|
|
||||||
sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
|
|
||||||
return sb.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
@ -4,16 +4,14 @@ import java.awt.geom.Line2D;
|
|||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.Formatter;
|
import java.util.Formatter;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.TreeMap;
|
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.CohenSutherlandClipping;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.Setter;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@ -23,10 +21,24 @@ public class Ruling extends Line2D.Float {
|
|||||||
public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2;
|
public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2;
|
||||||
public static final int COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT = 2;
|
public static final int COLINEAR_OR_PARALLEL_UNIT_EXPAND_AMOUNT = 2;
|
||||||
|
|
||||||
|
public enum Classification {
|
||||||
|
TABLE_LINE,
|
||||||
|
UNDERLINE,
|
||||||
|
STRIKETROUGH,
|
||||||
|
HEADER_SEPARATOR,
|
||||||
|
FOOTER_SEPARATOR,
|
||||||
|
OTHER
|
||||||
|
}
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
private Classification classification;
|
||||||
|
|
||||||
|
|
||||||
public Ruling(Point2D p1, Point2D p2) {
|
public Ruling(Point2D p1, Point2D p2) {
|
||||||
|
|
||||||
super(p1, p2);
|
super(p1, p2);
|
||||||
|
this.classification = Classification.OTHER;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -60,126 +72,32 @@ public class Ruling extends Line2D.Float {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// log(n) implementation of find_intersections
|
public void assertHorizontal() {
|
||||||
// based on http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf
|
|
||||||
public static Map<Point2D, Ruling[]> findIntersections(List<Ruling> horizontals, List<Ruling> verticals) {
|
|
||||||
|
|
||||||
class SortObject {
|
|
||||||
|
|
||||||
protected SOType type;
|
|
||||||
protected float position;
|
|
||||||
protected Ruling ruling;
|
|
||||||
|
|
||||||
|
|
||||||
public SortObject(SOType type, float position, Ruling ruling) {
|
|
||||||
|
|
||||||
this.type = type;
|
|
||||||
this.position = position;
|
|
||||||
this.ruling = ruling;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
if (isHorizontal()) {
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
throw new IllegalArgumentException("Ruling " + this + " is not horizontal");
|
||||||
List<SortObject> sos = new ArrayList<>();
|
|
||||||
|
|
||||||
TreeMap<Ruling, Boolean> tree = new TreeMap<>(new Comparator<Ruling>() {
|
|
||||||
@Override
|
|
||||||
public int compare(Ruling o1, Ruling o2) {
|
|
||||||
|
|
||||||
return java.lang.Double.compare(o1.getTop(), o2.getTop());
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
TreeMap<Point2D, Ruling[]> rv = new TreeMap<>(new Comparator<Point2D>() {
|
|
||||||
@Override
|
|
||||||
public int compare(Point2D o1, Point2D o2) {
|
|
||||||
|
|
||||||
if (o1.getY() > o2.getY()) {
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (o1.getY() < o2.getY()) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
if (o1.getX() > o2.getX()) {
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
if (o1.getX() < o2.getX()) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
for (Ruling h : horizontals) {
|
|
||||||
sos.add(new SortObject(SOType.HLEFT, h.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
|
|
||||||
sos.add(new SortObject(SOType.HRIGHT, h.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, h));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (Ruling v : verticals) {
|
|
||||||
sos.add(new SortObject(SOType.VERTICAL, v.getLeft(), v));
|
|
||||||
}
|
|
||||||
|
|
||||||
Collections.sort(sos, new Comparator<SortObject>() {
|
|
||||||
@Override
|
|
||||||
public int compare(SortObject a, SortObject b) {
|
|
||||||
|
|
||||||
int rv;
|
|
||||||
if (DoubleComparisons.feq(a.position, b.position)) {
|
|
||||||
if (a.type == SOType.VERTICAL && b.type == SOType.HLEFT) {
|
|
||||||
rv = 1;
|
|
||||||
} else if (a.type == SOType.VERTICAL && b.type == SOType.HRIGHT) {
|
|
||||||
rv = -1;
|
|
||||||
} else if (a.type == SOType.HLEFT && b.type == SOType.VERTICAL) {
|
|
||||||
rv = -1;
|
|
||||||
} else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
|
|
||||||
rv = 1;
|
|
||||||
} else {
|
|
||||||
rv = java.lang.Double.compare(a.position, b.position);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
return java.lang.Double.compare(a.position, b.position);
|
|
||||||
}
|
|
||||||
return rv;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
for (SortObject so : sos) {
|
|
||||||
switch (so.type) {
|
|
||||||
case VERTICAL:
|
|
||||||
for (Map.Entry<Ruling, Boolean> h : tree.entrySet()) {
|
|
||||||
try {
|
|
||||||
Point2D i = h.getKey().intersectionPoint(so.ruling);
|
|
||||||
if (i == null) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
rv.put(i, new Ruling[]{h.getKey().expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT), so.ruling.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT)});
|
|
||||||
} catch (UnsupportedOperationException e) {
|
|
||||||
log.info("Some line are oblique, ignoring...");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case HRIGHT:
|
|
||||||
tree.remove(so.ruling);
|
|
||||||
break;
|
|
||||||
case HLEFT:
|
|
||||||
tree.put(so.ruling, true);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return rv;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean vertical() {
|
public void assertVertical() {
|
||||||
|
|
||||||
|
if (isVertical()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
throw new IllegalArgumentException("Ruling " + this + " is not vertical");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isVertical() {
|
||||||
|
|
||||||
return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
return this.length() > 0 && DoubleComparisons.feq(this.x1, this.x2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean horizontal() {
|
public boolean isHorizontal() {
|
||||||
|
|
||||||
return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
return this.length() > 0 && DoubleComparisons.feq(this.y1, this.y2); //diff < ORIENTATION_CHECK_THRESHOLD;
|
||||||
}
|
}
|
||||||
@ -188,36 +106,36 @@ public class Ruling extends Line2D.Float {
|
|||||||
// these are used to have a single collapse method (in page, currently)
|
// these are used to have a single collapse method (in page, currently)
|
||||||
|
|
||||||
|
|
||||||
public boolean oblique() {
|
public boolean isOblique() {
|
||||||
|
|
||||||
return !(this.vertical() || this.horizontal());
|
return !(this.isVertical() || this.isHorizontal());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public float getPosition() {
|
public float getPosition() {
|
||||||
|
|
||||||
if (this.oblique()) {
|
if (this.isOblique()) {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
return this.vertical() ? this.getLeft() : this.getTop();
|
return this.isVertical() ? this.getLeft() : this.getTop();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public float getStart() {
|
public float getStart() {
|
||||||
|
|
||||||
if (this.oblique()) {
|
if (this.isOblique()) {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
return this.vertical() ? this.getTop() : this.getLeft();
|
return this.isVertical() ? this.getTop() : this.getLeft();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void setStart(float v) {
|
public void setStart(float v) {
|
||||||
|
|
||||||
if (this.oblique()) {
|
if (this.isOblique()) {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
if (this.vertical()) {
|
if (this.isVertical()) {
|
||||||
this.setTop(v);
|
this.setTop(v);
|
||||||
} else {
|
} else {
|
||||||
this.setLeft(v);
|
this.setLeft(v);
|
||||||
@ -227,19 +145,19 @@ public class Ruling extends Line2D.Float {
|
|||||||
|
|
||||||
public float getEnd() {
|
public float getEnd() {
|
||||||
|
|
||||||
if (this.oblique()) {
|
if (this.isOblique()) {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
return this.vertical() ? this.getBottom() : this.getRight();
|
return this.isVertical() ? this.getBottom() : this.getRight();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void setEnd(float v) {
|
public void setEnd(float v) {
|
||||||
|
|
||||||
if (this.oblique()) {
|
if (this.isOblique()) {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
if (this.vertical()) {
|
if (this.isVertical()) {
|
||||||
this.setBottom(v);
|
this.setBottom(v);
|
||||||
} else {
|
} else {
|
||||||
this.setRight(v);
|
this.setRight(v);
|
||||||
@ -249,10 +167,10 @@ public class Ruling extends Line2D.Float {
|
|||||||
|
|
||||||
public void setStartEnd(float start, float end) {
|
public void setStartEnd(float start, float end) {
|
||||||
|
|
||||||
if (this.oblique()) {
|
if (this.isOblique()) {
|
||||||
throw new UnsupportedOperationException();
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
if (this.vertical()) {
|
if (this.isVertical()) {
|
||||||
this.setTop(start);
|
this.setTop(start);
|
||||||
this.setBottom(end);
|
this.setBottom(end);
|
||||||
} else {
|
} else {
|
||||||
@ -264,7 +182,7 @@ public class Ruling extends Line2D.Float {
|
|||||||
|
|
||||||
public boolean perpendicularTo(Ruling other) {
|
public boolean perpendicularTo(Ruling other) {
|
||||||
|
|
||||||
return this.vertical() == other.horizontal();
|
return this.isVertical() == other.isHorizontal();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -318,30 +236,6 @@ public class Ruling extends Line2D.Float {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public Point2D intersectionPoint(Ruling other) {
|
|
||||||
|
|
||||||
Ruling this_l = this.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
|
|
||||||
Ruling other_l = other.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
|
|
||||||
Ruling horizontal, vertical;
|
|
||||||
|
|
||||||
if (!this_l.intersectsLine(other_l)) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this_l.horizontal() && other_l.vertical()) {
|
|
||||||
horizontal = this_l;
|
|
||||||
vertical = other_l;
|
|
||||||
} else if (this_l.vertical() && other_l.horizontal()) {
|
|
||||||
vertical = this_l;
|
|
||||||
horizontal = other_l;
|
|
||||||
} else {
|
|
||||||
log.warn("lines must be orthogonal, vertical and horizontal");
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
return new Point2D.Float(vertical.getLeft(), horizontal.getTop());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object other) {
|
public boolean equals(Object other) {
|
||||||
|
|
||||||
@ -451,16 +345,9 @@ public class Ruling extends Line2D.Float {
|
|||||||
|
|
||||||
final float TOLERANCE = 1;
|
final float TOLERANCE = 1;
|
||||||
return Math.abs(ruling.getX1() - x1) < TOLERANCE &&//
|
return Math.abs(ruling.getX1() - x1) < TOLERANCE &&//
|
||||||
Math.abs(ruling.getY1() - y1) < TOLERANCE &&//
|
Math.abs(ruling.getY1() - y1) < TOLERANCE &&//
|
||||||
Math.abs(ruling.getX2() - x2) < TOLERANCE &&//
|
Math.abs(ruling.getX2() - x2) < TOLERANCE &&//
|
||||||
Math.abs(ruling.getY2() - y2) < TOLERANCE;
|
Math.abs(ruling.getY2() - y2) < TOLERANCE;
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private enum SOType {
|
|
||||||
VERTICAL,
|
|
||||||
HRIGHT,
|
|
||||||
HLEFT
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -36,14 +36,11 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
private List<Cell> cells;
|
private List<Cell> cells;
|
||||||
|
|
||||||
|
|
||||||
public TablePageBlock(List<Cell> cells, Rectangle area, int rotation) {
|
public TablePageBlock(List<Cell> cells, int rotation) {
|
||||||
|
|
||||||
|
setToBBoxOfComponents(cells);
|
||||||
this.cells = cells;
|
this.cells = cells;
|
||||||
addCells(cells);
|
addCells(cells);
|
||||||
minX = area.getLeft();
|
|
||||||
minY = area.getBottom();
|
|
||||||
maxX = area.getRight();
|
|
||||||
maxY = area.getTop();
|
|
||||||
classification = PageBlockType.TABLE;
|
classification = PageBlockType.TABLE;
|
||||||
this.rotation = rotation;
|
this.rotation = rotation;
|
||||||
}
|
}
|
||||||
@ -230,15 +227,15 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
return new ArrayList<>();
|
return new ArrayList<>();
|
||||||
}
|
}
|
||||||
|
|
||||||
Set<Float> uniqueX = new HashSet<>();
|
Set<Double> uniqueX = new HashSet<>();
|
||||||
Set<Float> uniqueY = new HashSet<>();
|
Set<Double> uniqueY = new HashSet<>();
|
||||||
cells.stream()
|
cells.stream()
|
||||||
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
|
.filter(c -> !c.getTextBlocks().isEmpty() || c.getHeight() > 3 && c.getWidth() > 3)
|
||||||
.forEach(c -> {
|
.forEach(c -> {
|
||||||
uniqueX.add(c.getLeft());
|
uniqueX.add(c.getPdfMinX());
|
||||||
uniqueX.add(c.getRight());
|
uniqueX.add(c.getPdfMaxX());
|
||||||
uniqueY.add(c.getBottom());
|
uniqueY.add(c.getPdfMinY());
|
||||||
uniqueY.add(c.getTop());
|
uniqueY.add(c.getPdfMaxY());
|
||||||
});
|
});
|
||||||
|
|
||||||
var sortedUniqueX = uniqueX.stream()
|
var sortedUniqueX = uniqueX.stream()
|
||||||
@ -250,22 +247,24 @@ public class TablePageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
List<List<Cell>> rowsOfCells = new ArrayList<>();
|
List<List<Cell>> rowsOfCells = new ArrayList<>();
|
||||||
|
|
||||||
Float prevY = null;
|
Double prevY = null;
|
||||||
|
|
||||||
for (Float y : sortedUniqueY) {
|
for (Double y : sortedUniqueY) {
|
||||||
|
|
||||||
List<Cell> row = new ArrayList<>();
|
List<Cell> row = new ArrayList<>();
|
||||||
|
|
||||||
Float prevX = null;
|
Double prevX = null;
|
||||||
for (Float x : sortedUniqueX) {
|
for (Double x : sortedUniqueX) {
|
||||||
|
|
||||||
if (prevY != null && prevX != null) {
|
if (prevY != null && prevX != null) {
|
||||||
var cellFromGridStructure = new Cell(new Point2D.Float(prevX, prevY), new Point2D.Float(x, y));
|
var cellFromGridStructure = new Cell(new Point2D.Double(prevX, prevY), new Point2D.Double(x, y));
|
||||||
|
|
||||||
if (cellFromGridStructure.hasMinimumSize()) {
|
if (cellFromGridStructure.hasMinimumSize()) {
|
||||||
|
|
||||||
cells.stream()
|
cells.stream()
|
||||||
.map(originalCell -> new CellWithIntersection(originalCell, RectangleTransformations.calculateIntersectedArea(cellFromGridStructure, originalCell)))
|
.map(originalCell -> new CellWithIntersection(originalCell,
|
||||||
|
RectangleTransformations.calculateIntersectedArea(cellFromGridStructure.getBBoxInitialUserSpace(),
|
||||||
|
originalCell.getBBoxInitialUserSpace())))
|
||||||
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
|
.filter(cellWithIntersection -> cellWithIntersection.intersectedArea > 0)
|
||||||
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
|
.filter(cellWithIntersection -> cellWithIntersection.originalCell.getArea() > cellWithIntersection.intersectedArea * CELL_AREA_CONTAINED_THRESHOLD)
|
||||||
.max(Comparator.comparing(CellWithIntersection::intersectedArea))
|
.max(Comparator.comparing(CellWithIntersection::intersectedArea))
|
||||||
|
|||||||
@ -1,8 +1,12 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||||
|
|
||||||
|
import java.awt.geom.AffineTransform;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
@ -14,9 +18,11 @@ import lombok.SneakyThrows;
|
|||||||
@Builder
|
@Builder
|
||||||
@NoArgsConstructor
|
@NoArgsConstructor
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
public class RedTextPosition {
|
public class RedTextPosition extends BoundingBox {
|
||||||
|
|
||||||
private float[] position;
|
public final static int HEIGHT_PADDING = 2;
|
||||||
|
|
||||||
|
private Rectangle2D.Float bBoxDirAdj; // adjusted to text rotation
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
private int rotation;
|
private int rotation;
|
||||||
@ -58,43 +64,71 @@ public class RedTextPosition {
|
|||||||
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
pos.setFontSizeInPt(textPosition.getFontSizeInPt());
|
||||||
pos.setFontName(textPosition.getFont().getName());
|
pos.setFontName(textPosition.getFont().getName());
|
||||||
|
|
||||||
var position = new float[4];
|
//TODO: There is a mismatch in the java coords of the text and the rulings,
|
||||||
|
// I guess if we start with the initial user space positions and transform them the same way we do the rulings it would work.
|
||||||
|
pos.setBBox(new Rectangle2D.Float(textPosition.getX(), textPosition.getY(), textPosition.getWidthDirAdj(), textPosition.getHeight()));
|
||||||
|
|
||||||
position[0] = textPosition.getXDirAdj();
|
float textHeight = textPosition.getHeight() + HEIGHT_PADDING;
|
||||||
position[1] = textPosition.getYDirAdj();
|
Rectangle2D.Float dirAdjPosition = new Rectangle2D.Float(textPosition.getXDirAdj(),
|
||||||
position[2] = textPosition.getWidthDirAdj();
|
textPosition.getYDirAdj() - textHeight,
|
||||||
position[3] = textPosition.getHeightDir();
|
textPosition.getWidthDirAdj(),
|
||||||
|
textHeight + HEIGHT_PADDING);
|
||||||
|
pos.setBBoxDirAdj(dirAdjPosition);
|
||||||
|
|
||||||
|
AffineTransform affineTransform = getRotationMatrix(TextDirection.fromDegrees(textPosition.getDir()), textPosition.getPageWidth(), textPosition.getPageHeight());
|
||||||
|
Rectangle2D bBoxInitialUserSpace = affineTransform.createTransformedShape(dirAdjPosition).getBounds2D();
|
||||||
|
|
||||||
|
pos.setBBoxInitialUserSpace(bBoxInitialUserSpace); // These are definitely correct
|
||||||
|
|
||||||
pos.setPosition(position);
|
|
||||||
return pos;
|
return pos;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private static AffineTransform getRotationMatrix(TextDirection textDirection, float pageWidth, float pageHeight) {
|
||||||
|
|
||||||
|
AffineTransform transform = new AffineTransform();
|
||||||
|
|
||||||
|
if (textDirection == TextDirection.ZERO || textDirection == TextDirection.HALF_CIRCLE) {
|
||||||
|
transform.rotate(textDirection.getRadians(), pageWidth / 2f, pageHeight / 2f);
|
||||||
|
transform.translate(0f, pageHeight);
|
||||||
|
} else if (textDirection == TextDirection.QUARTER_CIRCLE) {
|
||||||
|
transform.rotate(textDirection.getRadians(), pageWidth / 2f, pageWidth / 2f);
|
||||||
|
transform.translate(0f, pageWidth);
|
||||||
|
} else {
|
||||||
|
transform.rotate(textDirection.getRadians(), pageHeight / 2f, pageHeight / 2f);
|
||||||
|
transform.translate(0f, pageWidth);
|
||||||
|
}
|
||||||
|
transform.scale(1., -1.);
|
||||||
|
return transform;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
public float getXDirAdj() {
|
public float getXDirAdj() {
|
||||||
|
|
||||||
return position[0];
|
return this.bBoxDirAdj.x;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
public float getYDirAdj() {
|
public float getYDirAdj() {
|
||||||
|
|
||||||
return position[1];
|
return this.bBoxDirAdj.y;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
public float getWidthDirAdj() {
|
public float getWidthDirAdj() {
|
||||||
|
|
||||||
return position[2];
|
return this.bBoxDirAdj.width;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
public float getHeightDir() {
|
public float getHeightDir() {
|
||||||
|
|
||||||
return position[3];
|
return this.bBoxDirAdj.height;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,16 +1,13 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||||
|
|
||||||
import static java.util.stream.Collectors.toSet;
|
import java.awt.geom.Rectangle2D;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextNormalizationUtilities;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
@ -29,34 +26,31 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
@Builder.Default
|
@Builder.Default
|
||||||
private List<TextPositionSequence> sequences = new ArrayList<>();
|
private List<TextPositionSequence> sequences = new ArrayList<>();
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private int rotation;
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private String mostPopularWordFont;
|
private String mostPopularWordFont;
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private String mostPopularWordStyle;
|
private String mostPopularWordStyle;
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private float mostPopularWordFontSize;
|
private float mostPopularWordFontSize;
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private float mostPopularWordHeight;
|
private float mostPopularWordHeight;
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private float mostPopularWordSpaceWidth;
|
private float mostPopularWordSpaceWidth;
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private float highestFontSize;
|
private float highestFontSize;
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private PageBlockType classification;
|
private PageBlockType classification;
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
private boolean toDuplicate;
|
private boolean toDuplicate;
|
||||||
|
|
||||||
|
|
||||||
|
public TextPageBlock(List<TextPositionSequence> sequences) {
|
||||||
|
|
||||||
|
this.sequences = sequences;
|
||||||
|
calculateFrequencyCounters();
|
||||||
|
calculateBBox();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
public TextDirection getDir() {
|
public TextDirection getDir() {
|
||||||
|
|
||||||
@ -64,6 +58,17 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void calculateBBox() {
|
||||||
|
|
||||||
|
if (sequences == null) {
|
||||||
|
this.bBox = new Rectangle2D.Double();
|
||||||
|
this.bBoxInitialUserSpace = new Rectangle2D.Double();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
setToBBoxOfComponents(sequences);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
public float getPageHeight() {
|
public float getPageHeight() {
|
||||||
|
|
||||||
@ -80,18 +85,28 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
|
|
||||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||||
|
|
||||||
|
if (textBlocksToMerge.isEmpty()) {
|
||||||
|
throw new IllegalArgumentException("Need to provide at least one TextPageBlock.");
|
||||||
|
}
|
||||||
|
if (textBlocksToMerge.stream()
|
||||||
|
.map(AbstractPageBlock::getPage)
|
||||||
|
.distinct()
|
||||||
|
.count() != 1) {
|
||||||
|
throw new IllegalArgumentException("Cannot merge textBlocks on different pages.");
|
||||||
|
}
|
||||||
|
|
||||||
List<TextPositionSequence> sequences = textBlocksToMerge.stream()
|
List<TextPositionSequence> sequences = textBlocksToMerge.stream()
|
||||||
.map(TextPageBlock::getSequences)
|
.map(TextPageBlock::getSequences)
|
||||||
.flatMap(java.util.Collection::stream)
|
.flatMap(java.util.Collection::stream)
|
||||||
.toList();
|
.toList();
|
||||||
sequences = new ArrayList<>(sequences);
|
sequences = new ArrayList<>(sequences);
|
||||||
return fromTextPositionSequences(sequences);
|
|
||||||
|
return new TextPageBlock(sequences);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static TextPageBlock fromTextPositionSequences(List<TextPositionSequence> wordBlockList) {
|
|
||||||
|
|
||||||
TextPageBlock textBlock = null;
|
private void calculateFrequencyCounters() {
|
||||||
|
|
||||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
||||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
||||||
@ -99,7 +114,7 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
||||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
||||||
|
|
||||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
for (TextPositionSequence wordBlock : sequences) {
|
||||||
|
|
||||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
||||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
||||||
@ -107,161 +122,23 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
fontFrequencyCounter.add(wordBlock.getFont());
|
fontFrequencyCounter.add(wordBlock.getFont());
|
||||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
||||||
|
|
||||||
if (textBlock == null) {
|
|
||||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
|
||||||
wordBlock.getMaxXDirAdj(),
|
|
||||||
wordBlock.getMinYDirAdj(),
|
|
||||||
wordBlock.getMaxYDirAdj(),
|
|
||||||
wordBlockList,
|
|
||||||
wordBlock.getRotation());
|
|
||||||
} else {
|
|
||||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
|
||||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (textBlock != null) {
|
setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
||||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
||||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
||||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
||||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
||||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
||||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (textBlock != null
|
|
||||||
&& textBlock.getSequences() != null
|
|
||||||
&& textBlock.getSequences()
|
|
||||||
.stream()
|
|
||||||
.map(t -> DoubleComparisons.round(t.getMinYDirAdj(), 3))
|
|
||||||
.collect(toSet()).size() == 1) {
|
|
||||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
|
||||||
}
|
|
||||||
return textBlock;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the minX value in pdf coordinate system.
|
|
||||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
|
||||||
* 0 -> LowerLeft
|
|
||||||
* 90 -> UpperLeft
|
|
||||||
* 180 -> UpperRight
|
|
||||||
* 270 -> LowerRight
|
|
||||||
*
|
|
||||||
* @return the minX value in pdf coordinate system
|
|
||||||
*/
|
|
||||||
@JsonIgnore
|
|
||||||
public float getPdfMinX() {
|
|
||||||
|
|
||||||
if (getDir().getDegrees() == 90) {
|
|
||||||
return minY;
|
|
||||||
} else if (getDir().getDegrees() == 180) {
|
|
||||||
return getPageWidth() - maxX;
|
|
||||||
|
|
||||||
} else if (getDir().getDegrees() == 270) {
|
|
||||||
|
|
||||||
return getPageWidth() - maxY;
|
|
||||||
} else {
|
|
||||||
return minX;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the maxX value in pdf coordinate system.
|
|
||||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
|
||||||
* 0 -> LowerLeft
|
|
||||||
* 90 -> UpperLeft
|
|
||||||
* 180 -> UpperRight
|
|
||||||
* 270 -> LowerRight
|
|
||||||
*
|
|
||||||
* @return the maxX value in pdf coordinate system
|
|
||||||
*/
|
|
||||||
@JsonIgnore
|
|
||||||
public float getPdfMaxX() {
|
|
||||||
|
|
||||||
if (getDir().getDegrees() == 90) {
|
|
||||||
return maxY;
|
|
||||||
} else if (getDir().getDegrees() == 180) {
|
|
||||||
return getPageWidth() - minX;
|
|
||||||
} else if (getDir().getDegrees() == 270) {
|
|
||||||
return getPageWidth() - minY;
|
|
||||||
|
|
||||||
} else {
|
|
||||||
return maxX;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the minY value in pdf coordinate system.
|
|
||||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
|
||||||
* 0 -> LowerLeft
|
|
||||||
* 90 -> UpperLeft
|
|
||||||
* 180 -> UpperRight
|
|
||||||
* 270 -> LowerRight
|
|
||||||
*
|
|
||||||
* @return the minY value in pdf coordinate system
|
|
||||||
*/
|
|
||||||
@JsonIgnore
|
|
||||||
public float getPdfMinY() {
|
|
||||||
|
|
||||||
if (getDir().getDegrees() == 90) {
|
|
||||||
return minX;
|
|
||||||
} else if (getDir().getDegrees() == 180) {
|
|
||||||
return maxY;
|
|
||||||
|
|
||||||
} else if (getDir().getDegrees() == 270) {
|
|
||||||
return getPageHeight() - maxX;
|
|
||||||
|
|
||||||
} else {
|
|
||||||
return getPageHeight() - maxY;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the maxY value in pdf coordinate system.
|
|
||||||
* Note: This needs to use Pdf Coordinate System where {0,0} rotated with the page rotation.
|
|
||||||
* 0 -> LowerLeft
|
|
||||||
* 90 -> UpperLeft
|
|
||||||
* 180 -> UpperRight
|
|
||||||
* 270 -> LowerRight
|
|
||||||
*
|
|
||||||
* @return the maxY value in pdf coordinate system
|
|
||||||
*/
|
|
||||||
@JsonIgnore
|
|
||||||
public float getPdfMaxY() {
|
|
||||||
|
|
||||||
if (getDir().getDegrees() == 90) {
|
|
||||||
return maxX;
|
|
||||||
} else if (getDir().getDegrees() == 180) {
|
|
||||||
|
|
||||||
return minY;
|
|
||||||
} else if (getDir().getDegrees() == 270) {
|
|
||||||
return getPageHeight() - minX;
|
|
||||||
} else {
|
|
||||||
return getPageHeight() - minY;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public TextPageBlock(float minX, float maxX, float minY, float maxY, List<TextPositionSequence> sequences, int rotation) {
|
|
||||||
|
|
||||||
this.minX = minX;
|
|
||||||
this.maxX = maxX;
|
|
||||||
this.minY = minY;
|
|
||||||
this.maxY = maxY;
|
|
||||||
this.sequences = sequences;
|
|
||||||
this.rotation = rotation;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public TextPageBlock union(TextPositionSequence r) {
|
public TextPageBlock union(TextPositionSequence r) {
|
||||||
|
|
||||||
TextPageBlock union = this.copy();
|
TextPageBlock union = this.copy();
|
||||||
union.add(r);
|
union.getSequences().add(r);
|
||||||
|
calculateFrequencyCounters();
|
||||||
|
calculateBBox();
|
||||||
return union;
|
return union;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -269,80 +146,32 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
public TextPageBlock union(TextPageBlock r) {
|
public TextPageBlock union(TextPageBlock r) {
|
||||||
|
|
||||||
TextPageBlock union = this.copy();
|
TextPageBlock union = this.copy();
|
||||||
union.add(r);
|
union.getSequences().addAll(r.getSequences());
|
||||||
|
calculateFrequencyCounters();
|
||||||
|
calculateBBox();
|
||||||
return union;
|
return union;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void add(TextPageBlock r) {
|
public void add(TextPageBlock r) {
|
||||||
|
|
||||||
if (r.getMinX() < minX) {
|
|
||||||
minX = r.getMinX();
|
|
||||||
}
|
|
||||||
if (r.getMaxX() > maxX) {
|
|
||||||
maxX = r.getMaxX();
|
|
||||||
}
|
|
||||||
if (r.getMinY() < minY) {
|
|
||||||
minY = r.getMinY();
|
|
||||||
}
|
|
||||||
if (r.getMaxY() > maxY) {
|
|
||||||
maxY = r.getMaxY();
|
|
||||||
}
|
|
||||||
sequences.addAll(r.getSequences());
|
sequences.addAll(r.getSequences());
|
||||||
|
calculateFrequencyCounters();
|
||||||
|
calculateBBox();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void add(TextPositionSequence r) {
|
public void add(TextPositionSequence r) {
|
||||||
|
|
||||||
setCoordinates(r);
|
sequences.add(r);
|
||||||
|
calculateFrequencyCounters();
|
||||||
|
calculateBBox();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public TextPageBlock copy() {
|
public TextPageBlock copy() {
|
||||||
|
|
||||||
return new TextPageBlock(minX, maxX, minY, maxY, sequences, rotation);
|
return new TextPageBlock(new ArrayList<>(sequences));
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void resize(float x1, float y1, float width, float height) {
|
|
||||||
|
|
||||||
set(x1, y1, x1 + width, y1 + height);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void resize() {
|
|
||||||
|
|
||||||
minX = Float.MAX_VALUE;
|
|
||||||
minY = Float.MAX_VALUE;
|
|
||||||
maxX = Float.MIN_VALUE;
|
|
||||||
maxY = Float.MIN_VALUE;
|
|
||||||
sequences.forEach(this::setCoordinates);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private void setCoordinates(TextPositionSequence sequence) {
|
|
||||||
|
|
||||||
if (sequence.getMinXDirAdj() < minX) {
|
|
||||||
minX = sequence.getMinXDirAdj();
|
|
||||||
}
|
|
||||||
if (sequence.getMaxXDirAdj() > maxX) {
|
|
||||||
maxX = sequence.getMaxXDirAdj();
|
|
||||||
}
|
|
||||||
if (sequence.getMinYDirAdj() < minY) {
|
|
||||||
minY = sequence.getMinYDirAdj();
|
|
||||||
}
|
|
||||||
if (sequence.getMaxYDirAdj() > maxY) {
|
|
||||||
maxY = sequence.getMaxYDirAdj();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void set(float x1, float y1, float x2, float y2) {
|
|
||||||
|
|
||||||
this.minX = Math.min(x1, x2);
|
|
||||||
this.maxX = Math.max(x1, x2);
|
|
||||||
this.minY = Math.min(y1, y2);
|
|
||||||
this.maxY = Math.max(y1, y2);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
package com.knecon.fforesight.service.layoutparser.processor.model.text;
|
||||||
|
|
||||||
import java.awt.geom.AffineTransform;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.awt.geom.Point2D;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
@ -9,15 +8,14 @@ import java.util.stream.Collectors;
|
|||||||
|
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Point;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.EqualsAndHashCode;
|
import lombok.EqualsAndHashCode;
|
||||||
import lombok.NoArgsConstructor;
|
import lombok.NoArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@ -25,8 +23,8 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@Builder
|
@Builder
|
||||||
@NoArgsConstructor
|
@NoArgsConstructor
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
@EqualsAndHashCode(onlyExplicitlyIncluded = true, callSuper = false)
|
||||||
public class TextPositionSequence implements CharSequence {
|
public class TextPositionSequence extends BoundingBox implements CharSequence {
|
||||||
|
|
||||||
public static final int HEIGHT_PADDING = 2;
|
public static final int HEIGHT_PADDING = 2;
|
||||||
|
|
||||||
@ -36,29 +34,38 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
@EqualsAndHashCode.Include
|
@EqualsAndHashCode.Include
|
||||||
private List<RedTextPosition> textPositions = new ArrayList<>();
|
private List<RedTextPosition> textPositions = new ArrayList<>();
|
||||||
|
|
||||||
|
private Rectangle2D bBoxDirAdj;
|
||||||
@EqualsAndHashCode.Include
|
@EqualsAndHashCode.Include
|
||||||
private TextDirection dir;
|
private TextDirection dir;
|
||||||
private int rotation;
|
private int rotation;
|
||||||
private float pageHeight;
|
private float pageHeight;
|
||||||
private float pageWidth;
|
private float pageWidth;
|
||||||
private boolean isParagraphStart;
|
private boolean isParagraphStart;
|
||||||
|
private boolean strikethrough;
|
||||||
|
private boolean underline;
|
||||||
|
|
||||||
|
|
||||||
public TextPositionSequence(int page) {
|
public TextPositionSequence(List<TextPosition> textPositions, int pageNumber, boolean isParagraphStart) {
|
||||||
|
|
||||||
this.page = page;
|
this.textPositions = textPositions.stream()
|
||||||
}
|
.map(RedTextPosition::fromTextPosition)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
this.page = pageNumber;
|
||||||
public TextPositionSequence(List<TextPosition> textPositions, int page, boolean isParagraphStart) {
|
|
||||||
|
|
||||||
this.textPositions = textPositions.stream().map(RedTextPosition::fromTextPosition).collect(Collectors.toList());
|
|
||||||
this.page = page;
|
|
||||||
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||||
this.rotation = textPositions.get(0).getRotation();
|
this.rotation = textPositions.get(0).getRotation();
|
||||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||||
this.isParagraphStart = isParagraphStart;
|
this.isParagraphStart = isParagraphStart;
|
||||||
|
calculateBBox();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void calculateBBox() {
|
||||||
|
|
||||||
|
this.bBoxDirAdj = textPositions.stream()
|
||||||
|
.map(RedTextPosition::getBBoxDirAdj)
|
||||||
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
setToBBoxOfComponents(getTextPositions());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -70,6 +77,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
this.rotation = textPositions.get(0).getRotation();
|
this.rotation = textPositions.get(0).getRotation();
|
||||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||||
|
calculateBBox();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -107,7 +115,7 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
textPositionSequence.rotation = rotation;
|
textPositionSequence.rotation = rotation;
|
||||||
textPositionSequence.pageHeight = pageHeight;
|
textPositionSequence.pageHeight = pageHeight;
|
||||||
textPositionSequence.pageWidth = pageWidth;
|
textPositionSequence.pageWidth = pageWidth;
|
||||||
|
textPositionSequence.setToBBoxOfComponents(getTextPositions());
|
||||||
return textPositionSequence;
|
return textPositionSequence;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -137,18 +145,18 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
this.rotation = textPositionSequence.getRotation();
|
this.rotation = textPositionSequence.getRotation();
|
||||||
this.pageHeight = textPositionSequence.getPageHeight();
|
this.pageHeight = textPositionSequence.getPageHeight();
|
||||||
this.pageWidth = textPositionSequence.getPageWidth();
|
this.pageWidth = textPositionSequence.getPageWidth();
|
||||||
|
calculateBBox();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void add(TextPosition textPosition) {
|
public void add(TextPosition textPosition) {
|
||||||
|
|
||||||
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
|
this.textPositions.add(RedTextPosition.fromTextPosition(textPosition));
|
||||||
|
|
||||||
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
this.dir = TextDirection.fromDegrees(textPositions.get(0).getDir());
|
||||||
this.rotation = textPositions.get(0).getRotation();
|
this.rotation = textPositions.get(0).getRotation();
|
||||||
this.pageHeight = textPositions.get(0).getPageHeight();
|
this.pageHeight = textPositions.get(0).getPageHeight();
|
||||||
this.pageWidth = textPositions.get(0).getPageWidth();
|
this.pageWidth = textPositions.get(0).getPageWidth();
|
||||||
|
calculateBBox();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -220,18 +228,6 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public float getHeight() {
|
|
||||||
|
|
||||||
return getMaxYDirAdj() - getMinYDirAdj();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public float getWidth() {
|
|
||||||
|
|
||||||
return getMaxXDirAdj() - getMinXDirAdj();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public String getFont() {
|
public String getFont() {
|
||||||
|
|
||||||
if (textPositions.get(0).getFontName() == null) {
|
if (textPositions.get(0).getFontName() == null) {
|
||||||
@ -271,54 +267,5 @@ public class TextPositionSequence implements CharSequence {
|
|||||||
return textPositions.get(0).getWidthOfSpace();
|
return textPositions.get(0).getWidthOfSpace();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This returns the bounding box of the word in Pdf Coordinate System where {0,0} rotated with the page rotation.
|
|
||||||
* 0 -> LowerLeft
|
|
||||||
* 90 -> UpperLeft
|
|
||||||
* 180 -> UpperRight
|
|
||||||
* 270 -> LowerRight
|
|
||||||
*
|
|
||||||
* @return bounding box of the word in Pdf Coordinate System
|
|
||||||
*/
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public Rectangle getRectangle() {
|
|
||||||
|
|
||||||
log.debug("Page: '{}', Word: '{}', Rotation: '{}', textRotation {}", page, this, rotation, dir);
|
|
||||||
|
|
||||||
float textHeight = getTextHeight();
|
|
||||||
|
|
||||||
RedTextPosition firstTextPos = textPositions.get(0);
|
|
||||||
RedTextPosition lastTextPos = textPositions.get(textPositions.size() - 1);
|
|
||||||
|
|
||||||
Point2D bottomLeft = new Point2D.Double(firstTextPos.getXDirAdj(), firstTextPos.getYDirAdj() - HEIGHT_PADDING);
|
|
||||||
Point2D topRight = new Point2D.Double(lastTextPos.getXDirAdj() + lastTextPos.getWidthDirAdj(), lastTextPos.getYDirAdj() + textHeight + HEIGHT_PADDING);
|
|
||||||
|
|
||||||
AffineTransform transform = new AffineTransform();
|
|
||||||
if (dir == TextDirection.ZERO || dir == TextDirection.HALF_CIRCLE) {
|
|
||||||
transform.rotate(dir.getRadians(), pageWidth / 2f, pageHeight / 2f);
|
|
||||||
transform.translate(0f, pageHeight + textHeight);
|
|
||||||
transform.scale(1., -1.);
|
|
||||||
} else if (dir == TextDirection.QUARTER_CIRCLE) {
|
|
||||||
transform.rotate(dir.getRadians(), pageWidth / 2f, pageWidth / 2f);
|
|
||||||
transform.translate(0f, pageWidth + textHeight);
|
|
||||||
transform.scale(1., -1.);
|
|
||||||
} else {
|
|
||||||
transform.rotate(dir.getRadians(), pageHeight / 2f, pageHeight / 2f);
|
|
||||||
transform.translate(0f, pageWidth + textHeight);
|
|
||||||
transform.scale(1., -1.);
|
|
||||||
}
|
|
||||||
|
|
||||||
bottomLeft = transform.transform(bottomLeft, null);
|
|
||||||
topRight = transform.transform(topRight, null);
|
|
||||||
|
|
||||||
return new Rectangle( //
|
|
||||||
new Point((float) bottomLeft.getX(), (float) bottomLeft.getY()),
|
|
||||||
(float) (topRight.getX() - bottomLeft.getX()),
|
|
||||||
(float) (topRight.getY() - bottomLeft.getY()),
|
|
||||||
page);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -9,6 +9,7 @@ import java.util.Map;
|
|||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
@ -54,11 +55,12 @@ public class ImageServiceResponseAdapter {
|
|||||||
|
|
||||||
classificationPage.getImages().forEach(image -> {
|
classificationPage.getImages().forEach(image -> {
|
||||||
if (image.getImageType().equals(ImageType.OTHER)) {
|
if (image.getImageType().equals(ImageType.OTHER)) {
|
||||||
classificationPage.getTextBlocks().forEach(textblock -> {
|
for (AbstractPageBlock textblock : classificationPage.getTextBlocks()) {
|
||||||
if (image.getPosition().contains(textblock.getMinX(), textblock.getMinY(), textblock.getWidth(), textblock.getHeight())) {
|
if (image.getPosition().contains(textblock.getBBoxInitialUserSpace())) {
|
||||||
image.setImageType(ImageType.OCR);
|
image.setImageType(ImageType.OCR);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
});
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@ -31,8 +31,9 @@ public class BodyTextFrameService {
|
|||||||
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
|
Rectangle bodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), false, layoutParsingType);
|
||||||
Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType);
|
Rectangle landscapeBodyTextFrame = calculateBodyTextFrame(classificationDocument.getPages(), classificationDocument.getFontSizeCounter(), true, layoutParsingType);
|
||||||
for (ClassificationPage page : classificationDocument.getPages()) {
|
for (ClassificationPage page : classificationDocument.getPages()) {
|
||||||
// var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
|
var updatedBodyTextFrame = getBodyTextFrameFromRulings(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||||
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
setBodyTextFrameAdjustedToPage(page, bodyTextFrame, landscapeBodyTextFrame);
|
||||||
|
classificationDocument.getVisualizations().addMainBodyVisualization(page.isLandscape() ? landscapeBodyTextFrame : bodyTextFrame, page.getPageNumber());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -58,24 +59,26 @@ public class BodyTextFrameService {
|
|||||||
|
|
||||||
private List<Ruling> getPotentialFooterRulings(ClassificationPage page) {
|
private List<Ruling> getPotentialFooterRulings(ClassificationPage page) {
|
||||||
|
|
||||||
return page.getCleanRulings()
|
return page.getCleanRulings().getHorizontals()
|
||||||
.getHorizontal()
|
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER))
|
||||||
.filter(ruling -> ruling.getY1() < page.getPageHeight() * RULING_HEIGHT_THRESHOLD)
|
.filter(ruling -> ruling.getY1() < page.getPageHeight() * RULING_HEIGHT_THRESHOLD)
|
||||||
.filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
|
.filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
|
||||||
.sorted(Comparator.comparingDouble(Ruling::getTop))
|
.sorted(Comparator.comparingDouble(Ruling::getTop))
|
||||||
|
.peek(ruling -> ruling.setClassification(Ruling.Classification.FOOTER_SEPARATOR))
|
||||||
.toList();
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<Ruling> getPotentialHeaderRulings(ClassificationPage page) {
|
private List<Ruling> getPotentialHeaderRulings(ClassificationPage page) {
|
||||||
|
|
||||||
return page.getCleanRulings()
|
return page.getCleanRulings().getHorizontals()
|
||||||
.getHorizontal()
|
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER))
|
||||||
.filter(ruling -> ruling.getY1() > page.getPageHeight() * (1 - RULING_HEIGHT_THRESHOLD))
|
.filter(ruling -> ruling.getY1() > page.getPageHeight() * (1 - RULING_HEIGHT_THRESHOLD))
|
||||||
.filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
|
.filter(ruling -> ruling.getWidth() > RULING_WIDTH_THRESHOLD * page.getPageWidth())
|
||||||
.sorted(Comparator.comparingDouble(Ruling::getBottom).reversed())
|
.sorted(Comparator.comparingDouble(Ruling::getBottom).reversed())
|
||||||
|
.peek(ruling -> ruling.setClassification(Ruling.Classification.HEADER_SEPARATOR))
|
||||||
.toList();
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -99,16 +102,16 @@ public class BodyTextFrameService {
|
|||||||
|
|
||||||
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() == 270) {
|
if (page.getPageWidth() > page.getPageHeight() && page.getRotation() == 270) {
|
||||||
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), page.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()),
|
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), page.getPageHeight() - textFrame.getTopLeft().getX() - textFrame.getWidth()),
|
||||||
textFrame.getHeight(),
|
textFrame.getHeight(),
|
||||||
textFrame.getWidth(),
|
textFrame.getWidth(),
|
||||||
0);
|
0);
|
||||||
} else if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
|
} else if (page.getPageWidth() > page.getPageHeight() && page.getRotation() != 0) {
|
||||||
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), page.getPageNumber());
|
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getY(), textFrame.getTopLeft().getX()), textFrame.getHeight(), textFrame.getWidth(), page.getPageNumber());
|
||||||
} else if (page.getRotation() == 180) {
|
} else if (page.getRotation() == 180) {
|
||||||
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), page.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()),
|
textFrame = new Rectangle(new Point(textFrame.getTopLeft().getX(), page.getPageHeight() - textFrame.getTopLeft().getY() - textFrame.getHeight()),
|
||||||
textFrame.getWidth(),
|
textFrame.getWidth(),
|
||||||
textFrame.getHeight(),
|
textFrame.getHeight(),
|
||||||
0);
|
0);
|
||||||
}
|
}
|
||||||
page.setBodyTextFrame(textFrame);
|
page.setBodyTextFrame(textFrame);
|
||||||
}
|
}
|
||||||
@ -152,14 +155,17 @@ public class BodyTextFrameService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || MarkedContentUtils.intersects(textBlock,
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || MarkedContentUtils.intersects(textBlock,
|
||||||
page.getMarkedContentBboxPerType(),
|
page.getMarkedContentBboxPerType(),
|
||||||
MarkedContentUtils.FOOTER)) {
|
MarkedContentUtils.FOOTER)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
float approxLineCount = PositionUtils.getApproxLineCount(textBlock);
|
double approxLineCount = PositionUtils.getApproxLineCount(textBlock);
|
||||||
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount && textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10) || !layoutParsingType.equals(
|
if (layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) //
|
||||||
LayoutParsingType.DOCUMINE) && approxLineCount < approximateHeaderLineCount) {
|
&& approxLineCount < approximateHeaderLineCount //
|
||||||
|
&& textBlock.getMaxY() >= page.getPageHeight() - (page.getPageHeight() / 10)//
|
||||||
|
|| !layoutParsingType.equals(LayoutParsingType.DOCUMINE_OLD) //
|
||||||
|
&& approxLineCount < approximateHeaderLineCount) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -185,10 +191,10 @@ public class BodyTextFrameService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return new Rectangle(new Point(expansionsRectangle.minX, expansionsRectangle.minY),
|
return new Rectangle(new Point((float) expansionsRectangle.minX, (float) expansionsRectangle.minY),
|
||||||
expansionsRectangle.maxX - expansionsRectangle.minX,
|
(float) (expansionsRectangle.maxX - expansionsRectangle.minX),
|
||||||
expansionsRectangle.maxY - expansionsRectangle.minY,
|
(float) (expansionsRectangle.maxY - expansionsRectangle.minY),
|
||||||
0);
|
0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -226,10 +232,10 @@ public class BodyTextFrameService {
|
|||||||
|
|
||||||
private class BodyTextFrameExpansionsRectangle {
|
private class BodyTextFrameExpansionsRectangle {
|
||||||
|
|
||||||
float minX = 10000;
|
double minX = 10000;
|
||||||
float maxX = -100;
|
double maxX = -100;
|
||||||
float minY = 10000;
|
double minY = 10000;
|
||||||
float maxY = -100;
|
double maxY = -100;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -44,9 +44,9 @@ public class GapDetectionService {
|
|||||||
|
|
||||||
if (yDifference > avgTextPositionHeight * Y_GAP_FACTOR) {
|
if (yDifference > avgTextPositionHeight * Y_GAP_FACTOR) {
|
||||||
yGapContext.addGap(mainBodyTextFrame.getMinX(),
|
yGapContext.addGap(mainBodyTextFrame.getMinX(),
|
||||||
previousTextPositionBBox.getMaxY(),
|
previousTextPositionBBox.getMaxY(),
|
||||||
mainBodyTextFrame.getWidth(),
|
mainBodyTextFrame.getWidth(),
|
||||||
-(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY()));
|
-(previousTextPositionBBox.getMaxY() - currentTextPositionBBox.getMinY()));
|
||||||
}
|
}
|
||||||
if (yDifference > avgTextPositionHeight * NEW_LINE_FACTOR) {
|
if (yDifference > avgTextPositionHeight * NEW_LINE_FACTOR) {
|
||||||
|
|
||||||
@ -69,32 +69,37 @@ public class GapDetectionService {
|
|||||||
|
|
||||||
private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) {
|
private static Rectangle2D toRectangle2D(TextPositionSequence textPosition) {
|
||||||
|
|
||||||
return mirrorY(RectangleTransformations.toRectangle2D(textPosition.getRectangle()));
|
return mirrorY(textPosition.getBBox());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static Rectangle2D mirrorY(Rectangle2D rectangle2D) {
|
private static Rectangle2D mirrorY(Rectangle2D rectangle2D) {
|
||||||
|
|
||||||
return new Rectangle2D.Double(rectangle2D.getX(), Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()), rectangle2D.getWidth(), Math.abs(rectangle2D.getHeight()));
|
return new Rectangle2D.Double(rectangle2D.getX(), Math.min(rectangle2D.getMinY(), rectangle2D.getMaxY()), rectangle2D.getWidth(), Math.abs(rectangle2D.getHeight()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
|
private static void addGapToLine(Rectangle2D currentTextPosition, Rectangle2D previousTextPosition, XGapsContext context) {
|
||||||
|
|
||||||
context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(),
|
context.gapsInCurrentLine.add(new Rectangle2D.Double(previousTextPosition.getMaxX(),
|
||||||
previousTextPosition.getMinY(),
|
previousTextPosition.getMinY(),
|
||||||
currentTextPosition.getMinX() - previousTextPosition.getMaxX(),
|
currentTextPosition.getMinX() - previousTextPosition.getMaxX(),
|
||||||
(previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2));
|
(previousTextPosition.getHeight() + currentTextPosition.getHeight()) / 2));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static void assertAllTextPositionsHaveSameDir(List<TextPositionSequence> textPositionSequences) {
|
private static void assertAllTextPositionsHaveSameDir(List<TextPositionSequence> textPositionSequences) {
|
||||||
|
|
||||||
assert textPositionSequences.stream().map(TextPositionSequence::getDir).allMatch(a -> a.equals(textPositionSequences.get(0).getDir()));
|
assert textPositionSequences.stream()
|
||||||
|
.map(TextPositionSequence::getDir)
|
||||||
|
.allMatch(a -> a.equals(textPositionSequences.get(0).getDir()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {
|
private static double getAvgTextPositionHeight(List<TextPositionSequence> textPositionSequences) {
|
||||||
|
|
||||||
return textPositionSequences.stream().mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
|
return textPositionSequences.stream()
|
||||||
|
.mapToDouble(TextPositionSequence::getHeight).average().orElseThrow();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -142,9 +147,9 @@ public class GapDetectionService {
|
|||||||
public void addGapToRightEdgeOfMainBody(Rectangle2D textPosition) {
|
public void addGapToRightEdgeOfMainBody(Rectangle2D textPosition) {
|
||||||
|
|
||||||
Rectangle2D leftGap = new Rectangle2D.Double(textPosition.getMaxX(),
|
Rectangle2D leftGap = new Rectangle2D.Double(textPosition.getMaxX(),
|
||||||
textPosition.getMinY(),
|
textPosition.getMinY(),
|
||||||
mainBodyTextFrame.getMaxX() - textPosition.getMaxX(),
|
mainBodyTextFrame.getMaxX() - textPosition.getMaxX(),
|
||||||
textPosition.getHeight());
|
textPosition.getHeight());
|
||||||
gapsInCurrentLine.add(leftGap);
|
gapsInCurrentLine.add(leftGap);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -152,9 +157,9 @@ public class GapDetectionService {
|
|||||||
public void addGapFromLeftEdgeOfMainBody(Rectangle2D textPosition) {
|
public void addGapFromLeftEdgeOfMainBody(Rectangle2D textPosition) {
|
||||||
|
|
||||||
Rectangle2D leftGap = new Rectangle2D.Double(mainBodyTextFrame.getMinX(),
|
Rectangle2D leftGap = new Rectangle2D.Double(mainBodyTextFrame.getMinX(),
|
||||||
textPosition.getMinY(),
|
textPosition.getMinY(),
|
||||||
textPosition.getMinX() - mainBodyTextFrame.getMinX(),
|
textPosition.getMinX() - mainBodyTextFrame.getMinX(),
|
||||||
textPosition.getHeight());
|
textPosition.getHeight());
|
||||||
gapsInCurrentLine.add(leftGap);
|
gapsInCurrentLine.add(leftGap);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -180,7 +180,7 @@ public class LineDetectionService {
|
|||||||
|
|
||||||
private Rectangle2D textPositionBBox(List<TextPositionSequence> textPositionSequences) {
|
private Rectangle2D textPositionBBox(List<TextPositionSequence> textPositionSequences) {
|
||||||
|
|
||||||
return RectangleTransformations.rectangleBBox(textPositionSequences.stream().map(TextPositionSequence::getRectangle).toList());
|
return RectangleTransformations.rectangle2DBBox(textPositionSequences.stream().map(TextPositionSequence::getBBox).toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -3,6 +3,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services;
|
|||||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.X_FIRST_RULING_COMPARATOR;
|
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.X_FIRST_RULING_COMPARATOR;
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -12,9 +13,9 @@ import java.util.stream.Collectors;
|
|||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableCells;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.UnionFind;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.UnionFind;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -31,7 +32,7 @@ public class RulingCleaningService {
|
|||||||
private static final float THRESHOLD_Y_HORIZONTAL = 3;
|
private static final float THRESHOLD_Y_HORIZONTAL = 3;
|
||||||
|
|
||||||
|
|
||||||
public CleanRulings getCleanRulings(List<TableCells> tableCells, List<Ruling> rulings) {
|
public CleanRulings deduplicateAndStraightenRulings(List<TableCells> tableCells, List<Ruling> rulings) {
|
||||||
|
|
||||||
Rulings verticalAndHorizontalRulingLines;
|
Rulings verticalAndHorizontalRulingLines;
|
||||||
|
|
||||||
@ -45,43 +46,45 @@ public class RulingCleaningService {
|
|||||||
verticalAndHorizontalRulingLines.horizontalLines.sort(X_FIRST_RULING_COMPARATOR);
|
verticalAndHorizontalRulingLines.horizontalLines.sort(X_FIRST_RULING_COMPARATOR);
|
||||||
verticalAndHorizontalRulingLines = cleanRulings(verticalAndHorizontalRulingLines);
|
verticalAndHorizontalRulingLines = cleanRulings(verticalAndHorizontalRulingLines);
|
||||||
|
|
||||||
return CleanRulings.builder().vertical(verticalAndHorizontalRulingLines.verticalLines()).horizontal(verticalAndHorizontalRulingLines.horizontalLines()).build();
|
return new CleanRulings(verticalAndHorizontalRulingLines.horizontalLines(), verticalAndHorizontalRulingLines.verticalLines());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private Rulings cleanRulings(Rulings rulings) {
|
private Rulings cleanRulings(Rulings rulings) {
|
||||||
|
|
||||||
List<List<Rectangle>> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
|
List<List<Rectangle2D>> groupedOverlappingVerticalRectangles = groupOverlappingRectangles(rulings.verticalLines.stream()
|
||||||
.map(RulingCleaningService::getOverlapRectangle)
|
|
||||||
.distinct()
|
|
||||||
.toList());
|
|
||||||
List<Ruling> cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream()
|
|
||||||
.map(rectList -> getXCenteredRuling(Rectangle.boundingBoxOf(rectList)))
|
|
||||||
.toList();
|
|
||||||
|
|
||||||
List<List<Rectangle>> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
|
|
||||||
.map(RulingCleaningService::getOverlapRectangle)
|
.map(RulingCleaningService::getOverlapRectangle)
|
||||||
.distinct()
|
.distinct()
|
||||||
.toList());
|
.toList());
|
||||||
|
List<Ruling> cleanedVerticalRulings = groupedOverlappingVerticalRectangles.stream()
|
||||||
|
.map(rectList -> getXCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList)))
|
||||||
|
.filter(ruling -> ruling.length() > 0)
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
List<List<Rectangle2D>> groupedOverlappingHorizontalRectangles = groupOverlappingRectangles(rulings.horizontalLines.stream()
|
||||||
|
.map(RulingCleaningService::getOverlapRectangle)
|
||||||
|
.distinct()
|
||||||
|
.toList());
|
||||||
|
|
||||||
List<Ruling> cleanedHorizontalRulings = groupedOverlappingHorizontalRectangles.stream()
|
List<Ruling> cleanedHorizontalRulings = groupedOverlappingHorizontalRectangles.stream()
|
||||||
.map(rectList -> getYCenteredRuling(Rectangle.boundingBoxOf(rectList)))
|
.map(rectList -> getYCenteredRuling(RectangleTransformations.rectangle2DBBox(rectList)))
|
||||||
|
.filter(ruling -> ruling.length() > 0)
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
return new Rulings(cleanedVerticalRulings, cleanedHorizontalRulings);
|
return new Rulings(cleanedVerticalRulings, cleanedHorizontalRulings);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<List<Rectangle>> groupOverlappingRectangles(List<Rectangle> rectangles) {
|
private List<List<Rectangle2D>> groupOverlappingRectangles(List<Rectangle2D> rectangles) {
|
||||||
|
|
||||||
UnionFind<Rectangle> unionFind = new UnionFind<>();
|
UnionFind<Rectangle2D> unionFind = new UnionFind<>();
|
||||||
for (int i = 0; i < rectangles.size(); i++) {
|
for (int i = 0; i < rectangles.size(); i++) {
|
||||||
for (int j = i + 1; j < rectangles.size(); j++) {
|
for (int j = i + 1; j < rectangles.size(); j++) {
|
||||||
Rectangle rectangle1 = rectangles.get(i);
|
Rectangle2D rectangle1 = rectangles.get(i);
|
||||||
Rectangle rectangle2 = rectangles.get(j);
|
Rectangle2D rectangle2 = rectangles.get(j);
|
||||||
|
|
||||||
// we can stop early when we are too far off because of x-y-sorting
|
// we can stop early when we are too far off because of x-y-sorting
|
||||||
if(rectangle1.getRight() < rectangle2.getLeft() && rectangle1.getBottom() < rectangle2.getTop()) {
|
if (rectangle1.getMaxX() < rectangle2.getMinX() && rectangle1.getMaxY() < rectangle2.getMinY()) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -91,66 +94,66 @@ public class RulingCleaningService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<Rectangle, List<Rectangle>> groups = new HashMap<>();
|
Map<Rectangle2D, List<Rectangle2D>> groups = new HashMap<>();
|
||||||
for (Rectangle rectangle : rectangles) {
|
for (Rectangle2D rectangle : rectangles) {
|
||||||
Rectangle root = unionFind.find(rectangle);
|
Rectangle2D root = unionFind.find(rectangle);
|
||||||
groups.computeIfAbsent(root, k -> new ArrayList<>()).add(rectangle);
|
groups.computeIfAbsent(root, k -> new ArrayList<>()).add(rectangle);
|
||||||
}
|
}
|
||||||
return new ArrayList<>(groups.values());
|
return new ArrayList<>(groups.values());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static Rectangle getOverlapRectangle(Ruling ruling) {
|
private static Rectangle2D getOverlapRectangle(Ruling ruling) {
|
||||||
|
|
||||||
float top;
|
float y;
|
||||||
float left;
|
float x;
|
||||||
float w;
|
float w;
|
||||||
float h;
|
float h;
|
||||||
|
|
||||||
if (ruling.x1 < ruling.x2) {
|
if (ruling.x1 < ruling.x2) {
|
||||||
left = ruling.x1;
|
x = ruling.x1;
|
||||||
w = ruling.x2 - ruling.x1;
|
w = ruling.x2 - ruling.x1;
|
||||||
} else {
|
} else {
|
||||||
left = ruling.x2;
|
x = ruling.x2;
|
||||||
w = ruling.x1 - ruling.x2;
|
w = ruling.x1 - ruling.x2;
|
||||||
}
|
}
|
||||||
if (ruling.y1 < ruling.y2) {
|
if (ruling.y1 < ruling.y2) {
|
||||||
top = ruling.y1;
|
y = ruling.y1;
|
||||||
h = ruling.y2 - ruling.y1;
|
h = ruling.y2 - ruling.y1;
|
||||||
} else {
|
} else {
|
||||||
top = ruling.y2;
|
y = ruling.y2;
|
||||||
h = ruling.y1 - ruling.y2;
|
h = ruling.y1 - ruling.y2;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ruling.horizontal()) {
|
if (ruling.isHorizontal()) {
|
||||||
return new Rectangle(top - THRESHOLD_Y_HORIZONTAL, left - THRESHOLD_X_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
|
return new Rectangle2D.Double(x - THRESHOLD_X_HORIZONTAL, y - THRESHOLD_Y_HORIZONTAL, w + 2 * THRESHOLD_X_HORIZONTAL, h + 2 * THRESHOLD_Y_HORIZONTAL);
|
||||||
} else {
|
} else {
|
||||||
return new Rectangle(top - THRESHOLD_Y_VERTICAL, left - THRESHOLD_X_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
|
return new Rectangle2D.Double(x - THRESHOLD_X_VERTICAL, y - THRESHOLD_Y_VERTICAL, w + 2 * THRESHOLD_X_VERTICAL, h + 2 * THRESHOLD_Y_VERTICAL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Ruling getXCenteredRuling(Rectangle rectangle) {
|
public static Ruling getXCenteredRuling(Rectangle2D rectangle) {
|
||||||
|
|
||||||
float x = (float) rectangle.getCenterX();
|
double x = rectangle.getCenterX();
|
||||||
float y1 = rectangle.getTop();
|
double y1 = rectangle.getMinY();
|
||||||
float y2 = rectangle.getBottom();
|
double y2 = rectangle.getMaxY();
|
||||||
|
|
||||||
Point2D point1 = new Point2D.Float(x, y1 + THRESHOLD_Y_VERTICAL);
|
Point2D point1 = new Point2D.Double(x, y1 + THRESHOLD_Y_VERTICAL);
|
||||||
Point2D point2 = new Point2D.Float(x, y2 - THRESHOLD_Y_VERTICAL);
|
Point2D point2 = new Point2D.Double(x, y2 - THRESHOLD_Y_VERTICAL);
|
||||||
|
|
||||||
return new Ruling(point1, point2);
|
return new Ruling(point1, point2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Ruling getYCenteredRuling(Rectangle rectangle) {
|
public static Ruling getYCenteredRuling(Rectangle2D rectangle) {
|
||||||
|
|
||||||
float x1 = rectangle.getLeft();
|
double x1 = rectangle.getX();
|
||||||
float x2 = rectangle.getRight();
|
double x2 = rectangle.getMaxX();
|
||||||
float y = (float) rectangle.getCenterY();
|
double y = rectangle.getCenterY();
|
||||||
|
|
||||||
Point2D point1 = new Point2D.Float(x1 + THRESHOLD_X_HORIZONTAL, y);
|
Point2D point1 = new Point2D.Double(x1 + THRESHOLD_X_HORIZONTAL, y);
|
||||||
Point2D point2 = new Point2D.Float(x2 - THRESHOLD_X_HORIZONTAL, y);
|
Point2D point2 = new Point2D.Double(x2 - THRESHOLD_X_HORIZONTAL, y);
|
||||||
|
|
||||||
return new Ruling(point1, point2);
|
return new Ruling(point1, point2);
|
||||||
}
|
}
|
||||||
@ -160,14 +163,14 @@ public class RulingCleaningService {
|
|||||||
|
|
||||||
List<Ruling> vrs = new ArrayList<>();
|
List<Ruling> vrs = new ArrayList<>();
|
||||||
for (Ruling vr : rulings) {
|
for (Ruling vr : rulings) {
|
||||||
if (vr.vertical()) {
|
if (vr.isVertical()) {
|
||||||
vrs.add(vr);
|
vrs.add(vr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
List<Ruling> hrs = new ArrayList<>();
|
List<Ruling> hrs = new ArrayList<>();
|
||||||
for (Ruling hr : rulings) {
|
for (Ruling hr : rulings) {
|
||||||
if (hr.horizontal()) {
|
if (hr.isHorizontal()) {
|
||||||
hrs.add(hr);
|
hrs.add(hr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -159,10 +159,10 @@ public class SectionsBuilderService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (ClassificationSection section : sectionsOnPage) {
|
for (ClassificationSection section : sectionsOnPage) {
|
||||||
Float xMin = null;
|
Double xMin = null;
|
||||||
Float yMin = null;
|
Double yMin = null;
|
||||||
Float xMax = null;
|
Double xMax = null;
|
||||||
Float yMax = null;
|
Double yMax = null;
|
||||||
|
|
||||||
for (AbstractPageBlock abs : section.getPageBlocks()) {
|
for (AbstractPageBlock abs : section.getPageBlocks()) {
|
||||||
if (abs.getPage() != page.getPageNumber()) {
|
if (abs.getPage() != page.getPageNumber()) {
|
||||||
@ -244,7 +244,7 @@ public class SectionsBuilderService {
|
|||||||
.get(0)
|
.get(0)
|
||||||
.stream()
|
.stream()
|
||||||
.map(cell -> {
|
.map(cell -> {
|
||||||
Cell fakeCell = new Cell(cell.getPoints()[0], cell.getPoints()[2]);
|
Cell fakeCell = Cell.copy(cell);
|
||||||
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
||||||
return fakeCell;
|
return fakeCell;
|
||||||
})
|
})
|
||||||
|
|||||||
@ -3,6 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.services;
|
|||||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
|
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.CELL_SIZE_COMPARATOR;
|
||||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
|
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.RECTANGLE_SIZE_COMPARATOR;
|
||||||
|
|
||||||
|
import java.awt.geom.AffineTransform;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -11,22 +13,26 @@ import java.util.stream.Collectors;
|
|||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.RedTextPosition;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.DoubleComparisons;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.SpreadsheetFinder;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
public class TableExtractionService {
|
public class TableExtractionService {
|
||||||
|
|
||||||
private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
|
private static final int MAX_TABLE_CONTAINED_CELLS_WITH_TEXT = 1;
|
||||||
private static final int TEXT_BLOCK_CONTAINMENT_TOLERANCE = 2;
|
|
||||||
private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7;
|
private static final double TABLE_UNIFORMITY_THRESHOLD = 0.7;
|
||||||
|
|
||||||
|
|
||||||
@ -59,29 +65,31 @@ public class TableExtractionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var cells = new ArrayList<>(new HashSet<>(emptyCells));
|
List<Cell> cells = new ArrayList<>(new HashSet<>(emptyCells));
|
||||||
DoubleComparisons.sort(cells, Rectangle.ILL_DEFINED_ORDER);
|
DoubleComparisons.sort(cells, BoundingBox.ILL_DEFINED_ORDER);
|
||||||
|
|
||||||
List<Rectangle> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
|
List<Rectangle2D> spreadsheetAreas = SpreadsheetFinder.findSpreadsheetsFromCells(cells);
|
||||||
// sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
|
// sort spreadsheetAreas by size (height * width) ascending so that cells are placed in the smallest tables first
|
||||||
// this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells
|
// this way no cell duplication occurs when tables are contained in other tables and only the most inner table contains the cells
|
||||||
spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);
|
spreadsheetAreas.sort(RECTANGLE_SIZE_COMPARATOR);
|
||||||
|
|
||||||
List<TablePageBlock> tables = new ArrayList<>();
|
List<TablePageBlock> tables = new ArrayList<>();
|
||||||
for (Rectangle area : spreadsheetAreas) {
|
for (Rectangle2D area : spreadsheetAreas) {
|
||||||
|
|
||||||
List<Cell> containedCells = new ArrayList<>();
|
List<Cell> containedCells = new ArrayList<>();
|
||||||
for (Cell c : cells) {
|
for (Cell c : cells) {
|
||||||
if (c.hasMinimumSize() && area.contains(c)) {
|
if (c.hasMinimumSize() && area.contains(c.getBBoxInitialUserSpace())) {
|
||||||
containedCells.add(c);
|
containedCells.add(c);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var containedCellsWithText = containedCells.stream().filter(cell -> !cell.getTextBlocks().isEmpty()).toList();
|
var containedCellsWithText = containedCells.stream()
|
||||||
|
.filter(cell -> !cell.getTextBlocks().isEmpty())
|
||||||
|
.toList();
|
||||||
|
|
||||||
// verify if table would contain fewer cells with text than the threshold allows
|
// verify if table would contain fewer cells with text than the threshold allows
|
||||||
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
|
if (containedCellsWithText.size() >= MAX_TABLE_CONTAINED_CELLS_WITH_TEXT && checkIfTableCellsAreUniform(containedCells)) {
|
||||||
tables.add(new TablePageBlock(containedCells, area, page.getRotation()));
|
tables.add(new TablePageBlock(containedCells, page.getRotation()));
|
||||||
cells.removeAll(containedCells);
|
cells.removeAll(containedCells);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -90,14 +98,18 @@ public class TableExtractionService {
|
|||||||
int position = -1;
|
int position = -1;
|
||||||
|
|
||||||
for (AbstractPageBlock pageBlock : page.getTextBlocks()) {
|
for (AbstractPageBlock pageBlock : page.getTextBlocks()) {
|
||||||
if (pageBlock instanceof TextPageBlock ? table.containsBlock((TextPageBlock) pageBlock) : table.contains(pageBlock) && position == -1) {
|
if (pageBlock instanceof TextPageBlock ? table.contains(pageBlock) : table.contains(pageBlock) && position == -1) {
|
||||||
position = page.getTextBlocks().indexOf(pageBlock);
|
position = page.getTextBlocks().indexOf(pageBlock);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (position != -1) {
|
if (position != -1) {
|
||||||
page.getTextBlocks().add(position, table);
|
page.getTextBlocks().add(position, table);
|
||||||
|
|
||||||
var toBeRemoved = table.getCells().stream().map(Cell::getTextBlocks).flatMap(List::stream).toList();
|
var toBeRemoved = table.getCells()
|
||||||
|
.stream()
|
||||||
|
.map(Cell::getTextBlocks)
|
||||||
|
.flatMap(List::stream)
|
||||||
|
.toList();
|
||||||
// remove text blocks from the page that were also added with the table (from its contained cells)
|
// remove text blocks from the page that were also added with the table (from its contained cells)
|
||||||
page.getTextBlocks().removeAll(toBeRemoved);
|
page.getTextBlocks().removeAll(toBeRemoved);
|
||||||
}
|
}
|
||||||
@ -112,7 +124,7 @@ public class TableExtractionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Map<Long, List<Long>> cellsGroupedByRoundedWidth = containedCells.stream()
|
Map<Long, List<Long>> cellsGroupedByRoundedWidth = containedCells.stream()
|
||||||
.map(Rectangle::getWidth)
|
.map(BoundingBox::getWidth)
|
||||||
.map(size -> Math.round(size / 10.0) * 10)
|
.map(size -> Math.round(size / 10.0) * 10)
|
||||||
.collect(Collectors.groupingBy(Long::longValue));
|
.collect(Collectors.groupingBy(Long::longValue));
|
||||||
|
|
||||||
@ -122,25 +134,26 @@ public class TableExtractionService {
|
|||||||
|
|
||||||
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
|
private boolean doesCellContainTextBlock(Cell cell, TextPageBlock textBlock) {
|
||||||
|
|
||||||
if(cell.isEmpty() || textBlock.getSequences().isEmpty()) {
|
return cell.contains(textBlock, RedTextPosition.HEIGHT_PADDING);
|
||||||
return false;
|
|
||||||
}
|
|
||||||
double x = textBlock.getPdfMinX();
|
|
||||||
double y = textBlock.getPdfMinY();
|
|
||||||
double w = textBlock.getPdfMaxX() - textBlock.getPdfMinX();
|
|
||||||
double h = textBlock.getPdfMaxY() - textBlock.getPdfMinY();
|
|
||||||
if (w <= 0 || h <= 0) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
double x0 = cell.getX();
|
|
||||||
double y0 = cell.getY();
|
|
||||||
return (x >= x0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && y >= y0 - TEXT_BLOCK_CONTAINMENT_TOLERANCE && (x + w) <= x0 + cell.getWidth() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE && (y + h) <= y0 + cell.getHeight() + 2 * TEXT_BLOCK_CONTAINMENT_TOLERANCE);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
@SneakyThrows
|
||||||
|
public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines, PageInformation pageInformation) {
|
||||||
|
|
||||||
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines).stream().map(Cell::new).collect(Collectors.toList());
|
AffineTransform affineTransform = CoordinateTransforms.calculateInitialUserSpaceCoordsToImageCoords(pageInformation, 1);
|
||||||
|
/*
|
||||||
|
switch (pageInformation.rotationDegrees()) {
|
||||||
|
case 90 -> affineTransform.translate(RedTextPosition.HEIGHT_PADDING, 0); //although this is wrong, our text coordinates are wrong as well
|
||||||
|
case 180 -> affineTransform.translate(0, RedTextPosition.HEIGHT_PADDING);
|
||||||
|
case 270 -> affineTransform.translate(-RedTextPosition.HEIGHT_PADDING, 0);
|
||||||
|
default -> affineTransform.translate(0, -RedTextPosition.HEIGHT_PADDING);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
return RectangularIntersectionFinder.find(horizontalRulingLines, verticalRulingLines)
|
||||||
|
.stream()
|
||||||
|
.map(rect -> new Cell(rect, affineTransform))
|
||||||
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,99 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextDirection;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
public class TextRulingsClassifier {
|
||||||
|
|
||||||
|
private final static double STRIKETHROUGH_ZONE = 0.5; // multiplied with text height, determines height of intersection interval for strikethrough lines.
|
||||||
|
private final static double UNDERLINE_ZONE = 0.2; // multiplied with text height, determines height of intersection interval of underline lines.
|
||||||
|
private final static double TEXT_BBOX_THRESHOLD_FACTOR = 0.15; // multiplied with text width then subtracted from word width. If ruling covers this width, it is considered as strikethrough/underline.
|
||||||
|
|
||||||
|
|
||||||
|
public static void classifyUnderlinedAndStrikethroughText(List<TextPositionSequence> words, CleanRulings cleanRulings) {
|
||||||
|
|
||||||
|
for (TextPositionSequence word : words) {
|
||||||
|
if (word.getDir().equals(TextDirection.ZERO) || word.getDir().equals(TextDirection.HALF_CIRCLE)) {
|
||||||
|
handleHorizontalText(cleanRulings, word);
|
||||||
|
} else {
|
||||||
|
handleVerticalText(cleanRulings, word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void handleVerticalText(CleanRulings cleanRulings, TextPositionSequence word) {
|
||||||
|
|
||||||
|
float lowerY = (float) (word.getBBoxInitialUserSpace().getMinY() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||||
|
float upperY = (float) (word.getBBoxInitialUserSpace().getMaxY() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||||
|
|
||||||
|
float strikethroughCenterX = (float) word.getBBoxInitialUserSpace().getCenterX();
|
||||||
|
float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||||
|
|
||||||
|
float underlineCenterX = (float) (word.getDir().equals(TextDirection.QUARTER_CIRCLE) ? word.getBBoxInitialUserSpace().getMaxX() : word.getBBoxInitialUserSpace().getMinX());
|
||||||
|
float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2);
|
||||||
|
|
||||||
|
float leftX = Math.min(underlineCenterX - underlineBoxHeight, strikethroughCenterX - strikethroughBoxHeight);
|
||||||
|
float rightX = Math.max(underlineCenterX + underlineBoxHeight, strikethroughCenterX + strikethroughBoxHeight);
|
||||||
|
|
||||||
|
List<Ruling> rulingsIntersectingWord = cleanRulings.getVerticalsInXInterval(leftX, rightX)
|
||||||
|
.stream()
|
||||||
|
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER))
|
||||||
|
.filter(ruling -> ruling.y1 <= lowerY && upperY <= ruling.y2)
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
for (Ruling ruling : rulingsIntersectingWord) {
|
||||||
|
if (strikethroughCenterX - strikethroughBoxHeight < ruling.x1 && ruling.x1 < strikethroughCenterX + strikethroughBoxHeight) {
|
||||||
|
ruling.setClassification(Ruling.Classification.STRIKETROUGH);
|
||||||
|
word.setStrikethrough(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (underlineCenterX - underlineBoxHeight < ruling.x1 && ruling.x1 < underlineCenterX + underlineBoxHeight) {
|
||||||
|
ruling.setClassification(Ruling.Classification.UNDERLINE);
|
||||||
|
word.setUnderline(true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void handleHorizontalText(CleanRulings cleanRulings, TextPositionSequence word) {
|
||||||
|
|
||||||
|
float leftX = (float) (word.getBBoxInitialUserSpace().getMinX() + TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||||
|
float rightX = (float) (word.getBBoxInitialUserSpace().getMaxX() - TEXT_BBOX_THRESHOLD_FACTOR * word.getWidth());
|
||||||
|
|
||||||
|
float strikethroughCenterY = (float) word.getBBoxInitialUserSpace().getCenterY();
|
||||||
|
float strikethroughBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * STRIKETHROUGH_ZONE) / 2);
|
||||||
|
|
||||||
|
float underlineCenterY = (float) (word.getDir().equals(TextDirection.ZERO) ? word.getBBoxInitialUserSpace().getMinY() : word.getBBoxInitialUserSpace().getMaxY());
|
||||||
|
float underlineBoxHeight = (float) ((word.getBBoxDirAdj().getHeight() * UNDERLINE_ZONE) / 2);
|
||||||
|
|
||||||
|
float lowerY = Math.min(underlineCenterY - underlineBoxHeight, strikethroughCenterY - strikethroughBoxHeight);
|
||||||
|
float upperY = Math.max(underlineCenterY + underlineBoxHeight, strikethroughCenterY + strikethroughBoxHeight);
|
||||||
|
|
||||||
|
List<Ruling> rulingsIntersectingWord = cleanRulings.getHorizontalsInYInterval(lowerY, upperY)
|
||||||
|
.stream()
|
||||||
|
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.OTHER))
|
||||||
|
.filter(ruling -> ruling.x1 <= leftX && rightX <= ruling.x2)
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
for (Ruling ruling : rulingsIntersectingWord) {
|
||||||
|
if (strikethroughCenterY - strikethroughBoxHeight < ruling.y1 && ruling.y1 < strikethroughCenterY + strikethroughBoxHeight) {
|
||||||
|
ruling.setClassification(Ruling.Classification.STRIKETROUGH);
|
||||||
|
word.setStrikethrough(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (underlineCenterY - underlineBoxHeight < ruling.y1 && ruling.y1 < underlineCenterY + underlineBoxHeight) {
|
||||||
|
ruling.setClassification(Ruling.Classification.UNDERLINE);
|
||||||
|
word.setUnderline(true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,7 +1,5 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||||
|
|
||||||
import static java.util.stream.Collectors.toSet;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -9,21 +7,17 @@ import java.util.ListIterator;
|
|||||||
|
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.DocstrumSegmentationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.utils.DoubleUtils;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
@ -37,22 +31,60 @@ public class DocstrumBlockificationService {
|
|||||||
static final float THRESHOLD = 1f;
|
static final float THRESHOLD = 1f;
|
||||||
|
|
||||||
|
|
||||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells, boolean xyOrder) {
|
public ClassificationPage blockify(List<TextPositionSequence> textPositions,
|
||||||
|
CleanRulings rulings,
|
||||||
|
boolean xyOrder,
|
||||||
|
LayoutparsingVisualizations visualizations,
|
||||||
|
LayoutParsingType layoutParsingType) {
|
||||||
|
|
||||||
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
|
CleanRulings usedRulings = rulings.withoutTextRulings();
|
||||||
|
|
||||||
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder);
|
var zones = docstrumSegmentationService.segmentPage(textPositions, xyOrder, usedRulings, visualizations);
|
||||||
var pageBlocks = toAbstractPageBlocks(zones, usedRulings.getHorizontal(), usedRulings.getVertical(), xyOrder);
|
|
||||||
|
if (!textPositions.isEmpty()) {
|
||||||
|
visualizations.addZoneVisualizations(zones, textPositions.get(0).getPage());
|
||||||
|
visualizations.addLineVisualizationsFromZones(zones, textPositions.get(0).getPage());
|
||||||
|
visualizations.addCharactersWithNeighbours(zones, textPositions.get(0).getPage());
|
||||||
|
}
|
||||||
|
|
||||||
|
var pageBlocks = toAbstractPageBlocks(zones, xyOrder, usedRulings);
|
||||||
|
|
||||||
|
if (xyOrder) {
|
||||||
|
sortPageBlocksXThenY(pageBlocks);
|
||||||
|
}
|
||||||
|
|
||||||
var classificationPage = new ClassificationPage(pageBlocks);
|
var classificationPage = new ClassificationPage(pageBlocks);
|
||||||
|
classificationPage.setCleanRulings(rulings);
|
||||||
|
|
||||||
mergeIntersectingBlocks(classificationPage.getTextBlocks(), 0, 0);
|
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 0);
|
||||||
|
|
||||||
|
if (layoutParsingType == LayoutParsingType.DOCUMINE || layoutParsingType == LayoutParsingType.REDACT_MANAGER) {
|
||||||
|
combineBlocks(classificationPage);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (layoutParsingType == LayoutParsingType.CLARIFYND) {
|
||||||
|
mergeIntersectingBlocks(classificationPage, usedRulings, 0, 6.5f);
|
||||||
|
}
|
||||||
|
|
||||||
return classificationPage;
|
return classificationPage;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, List<Ruling> horizontalRulings, List<Ruling> verticalRulings, boolean xyOrder) {
|
private static void sortPageBlocksXThenY(List<AbstractPageBlock> pageBlocks) {
|
||||||
|
|
||||||
|
pageBlocks.sort(Comparator.comparing(AbstractPageBlock::getMinY, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD))
|
||||||
|
.thenComparing(AbstractPageBlock::getMinX, (o1, o2) -> DoubleUtils.compareDouble(o1, o2, THRESHOLD)));
|
||||||
|
pageBlocks.sort(new Comparator<AbstractPageBlock>() {
|
||||||
|
@Override
|
||||||
|
public int compare(AbstractPageBlock o1, AbstractPageBlock o2) {
|
||||||
|
|
||||||
|
return Math.abs(o1.getMinY() - o2.getMinY()) < 5 && o1.getMinX() < o2.getMinX() == true ? -1 : 0;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<AbstractPageBlock> toAbstractPageBlocks(List<Zone> zones, boolean xyOrder, CleanRulings usedRulings) {
|
||||||
|
|
||||||
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
|
List<AbstractPageBlock> abstractPageBlocks = new ArrayList<>();
|
||||||
zones.forEach(zone -> {
|
zones.forEach(zone -> {
|
||||||
@ -66,7 +98,7 @@ public class DocstrumBlockificationService {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
abstractPageBlocks.addAll(splitZonesAtRulings(textPositionSequences, horizontalRulings, verticalRulings));
|
abstractPageBlocks.add(buildTextBlock(textPositionSequences, 0));
|
||||||
});
|
});
|
||||||
|
|
||||||
if (xyOrder) {
|
if (xyOrder) {
|
||||||
@ -89,6 +121,7 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
TextPageBlock previous = new TextPageBlock();
|
TextPageBlock previous = new TextPageBlock();
|
||||||
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
ListIterator<AbstractPageBlock> itty = page.getTextBlocks().listIterator();
|
||||||
|
CleanRulings usedRulings = page.getCleanRulings().withoutTextRulings();
|
||||||
while (itty.hasNext()) {
|
while (itty.hasNext()) {
|
||||||
|
|
||||||
AbstractPageBlock block = itty.next();
|
AbstractPageBlock block = itty.next();
|
||||||
@ -100,7 +133,7 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
if (previous != null && !previous.getSequences().isEmpty()) {
|
if (previous != null && !previous.getSequences().isEmpty()) {
|
||||||
|
|
||||||
if (current.getDir() != previous.getDir()) {
|
if (current.getDir() != previous.getDir() || usedRulings.lineBetween(current, previous)) {
|
||||||
previous = current;
|
previous = current;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -120,7 +153,7 @@ public class DocstrumBlockificationService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (previous.almostIntersects(current, 0, 0)) {
|
if (previous.intersects(current)) {
|
||||||
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
previous = combineBlocksAndResetIterator(previous, current, itty, previous.isToDuplicate());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -139,7 +172,7 @@ public class DocstrumBlockificationService {
|
|||||||
previous = current;
|
previous = current;
|
||||||
}
|
}
|
||||||
|
|
||||||
mergeIntersectingBlocks(page.getTextBlocks(), 0, 6.5f);
|
mergeIntersectingBlocks(page, usedRulings, 0, 6.5f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -230,8 +263,9 @@ public class DocstrumBlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void mergeIntersectingBlocks(List<AbstractPageBlock> blocks, float xThreshold, float yThreshold) {
|
public void mergeIntersectingBlocks(ClassificationPage page, CleanRulings usedRulings, float xThreshold, float yThreshold) {
|
||||||
|
|
||||||
|
var blocks = page.getTextBlocks();
|
||||||
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
ListIterator<AbstractPageBlock> itty = blocks.listIterator();
|
||||||
while (itty.hasNext()) {
|
while (itty.hasNext()) {
|
||||||
AbstractPageBlock block = itty.next();
|
AbstractPageBlock block = itty.next();
|
||||||
@ -267,7 +301,11 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
|
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
|
||||||
|
|
||||||
if (current.getDir() == inner.getDir() && current.almostIntersects(inner, yThreshold, xThreshold)) {
|
if (usedRulings.lineBetween(current, blocks.get(i))) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (current.getDir() == inner.getDir() && current.intersects(inner, yThreshold, xThreshold)) {
|
||||||
|
|
||||||
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
boolean toDuplicate = current.isToDuplicate() || inner.isToDuplicate();
|
||||||
current.getSequences().addAll(inner.getSequences());
|
current.getSequences().addAll(inner.getSequences());
|
||||||
@ -351,111 +389,7 @@ public class DocstrumBlockificationService {
|
|||||||
|
|
||||||
public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||||
|
|
||||||
TextPageBlock textBlock = null;
|
return new TextPageBlock(wordBlockList);
|
||||||
|
|
||||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
|
||||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
|
||||||
|
|
||||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
|
||||||
|
|
||||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
|
||||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
|
||||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
|
||||||
fontFrequencyCounter.add(wordBlock.getFont());
|
|
||||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
|
||||||
|
|
||||||
if (textBlock == null) {
|
|
||||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
|
||||||
wordBlock.getMaxXDirAdj(),
|
|
||||||
wordBlock.getMinYDirAdj(),
|
|
||||||
wordBlock.getMaxYDirAdj(),
|
|
||||||
wordBlockList,
|
|
||||||
wordBlock.getRotation());
|
|
||||||
} else {
|
|
||||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
|
||||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (textBlock != null) {
|
|
||||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (textBlock != null
|
|
||||||
&& textBlock.getSequences() != null
|
|
||||||
&& textBlock.getSequences()
|
|
||||||
.stream()
|
|
||||||
.map(t -> round(t.getMinYDirAdj(), 3))
|
|
||||||
.collect(toSet()).size() == 1) {
|
|
||||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
|
||||||
}
|
|
||||||
return textBlock;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean isSplitByRuling(float minX,
|
|
||||||
float minY,
|
|
||||||
float maxX,
|
|
||||||
float maxY,
|
|
||||||
TextPositionSequence word,
|
|
||||||
List<Ruling> horizontalRulingLines,
|
|
||||||
List<Ruling> verticalRulingLines) {
|
|
||||||
|
|
||||||
return isSplitByRuling(maxX, minY, word.getMinXDirAdj(), word.getMinYDirAdj(), verticalRulingLines, word.getDir().getDegrees(), word.getPageWidth(), word.getPageHeight())
|
|
||||||
//
|
|
||||||
|| isSplitByRuling(minX,
|
|
||||||
minY,
|
|
||||||
word.getMinXDirAdj(),
|
|
||||||
word.getMaxYDirAdj(),
|
|
||||||
horizontalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight())
|
|
||||||
//
|
|
||||||
|| isSplitByRuling(maxX,
|
|
||||||
minY,
|
|
||||||
word.getMinXDirAdj(),
|
|
||||||
word.getMinYDirAdj(),
|
|
||||||
horizontalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight())
|
|
||||||
//
|
|
||||||
|| isSplitByRuling(minX,
|
|
||||||
minY,
|
|
||||||
word.getMinXDirAdj(),
|
|
||||||
word.getMaxYDirAdj(),
|
|
||||||
verticalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
|
||||||
|
|
||||||
for (Ruling ruling : rulingLines) {
|
|
||||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
|
||||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private static double round(float value, int decimalPoints) {
|
|
||||||
|
|
||||||
var d = Math.pow(10, decimalPoints);
|
|
||||||
return Math.round(value * d) / d;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -15,11 +15,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
public class DocuMineBlockificationService {
|
public class DocuMineBlockificationService {
|
||||||
@ -34,15 +33,16 @@ public class DocuMineBlockificationService {
|
|||||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||||
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
* Rulings (Table lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||||
*
|
*
|
||||||
* @param textPositions The words of a page.
|
* @param textPositions The textPositions of a page.
|
||||||
* @param horizontalRulingLines Horizontal table lines.
|
* @param cleanRulings All rulings on a page
|
||||||
* @param verticalRulingLines Vertical table lines.
|
|
||||||
* @return Page object that contains the Textblock and text statistics.
|
* @return Page object that contains the Textblock and text statistics.
|
||||||
*/
|
*/
|
||||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings) {
|
||||||
|
|
||||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||||
List<AbstractPageBlock> chunkBlockList1 = new ArrayList<>();
|
List<AbstractPageBlock> textPageBlocks = new ArrayList<>();
|
||||||
|
|
||||||
|
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
|
||||||
|
|
||||||
float minX = 1000;
|
float minX = 1000;
|
||||||
float maxX = 0;
|
float maxX = 0;
|
||||||
@ -59,23 +59,26 @@ public class DocuMineBlockificationService {
|
|||||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||||
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
|
boolean negativeXGap = prev != null && word.getMinXDirAdj() - minX < -5;
|
||||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, horizontalRulingLines, verticalRulingLines);
|
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
|
||||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||||
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle()
|
boolean splitByOtherFontAndOtherY = prev != null && Math.abs(prev.getMaxYDirAdj() - word.getMaxYDirAdj()) > word.getTextHeight() * 0.2 && (word.getFontStyle()
|
||||||
.contains("bold") && !prev.getFontStyle().contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
.contains("bold")
|
||||||
|
&& !prev.getFontStyle()
|
||||||
|
.contains("bold") || prev.getFontStyle().contains("bold") && !word.getFontStyle().contains("bold"));
|
||||||
|
|
||||||
Matcher matcher = pattern.matcher(chunkWords.stream().collect(Collectors.joining(" ")).toString());
|
Matcher matcher = pattern.matcher(chunkWords.stream()
|
||||||
|
.collect(Collectors.joining(" ")).toString());
|
||||||
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
|
boolean startsOnSameX = Math.abs(minX - word.getMinXDirAdj()) < 5 && matcher.matches();
|
||||||
|
|
||||||
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) {
|
if (prev != null && (lineSeparation || startFromTop || splitByDir || isSplitByRuling || splitByOtherFontAndOtherY || negativeXGap || startsOnSameX)) {
|
||||||
|
|
||||||
Orientation prevOrientation = null;
|
Orientation prevOrientation = null;
|
||||||
if (!chunkBlockList1.isEmpty()) {
|
if (!textPageBlocks.isEmpty()) {
|
||||||
prevOrientation = chunkBlockList1.get(chunkBlockList1.size() - 1).getOrientation();
|
prevOrientation = textPageBlocks.get(textPageBlocks.size() - 1).getOrientation();
|
||||||
}
|
}
|
||||||
|
|
||||||
TextPageBlock cb1 = buildTextBlock(chunkWords);
|
TextPageBlock cb1 = new TextPageBlock(chunkWords);
|
||||||
chunkBlockList1.add(cb1);
|
textPageBlocks.add(cb1);
|
||||||
chunkWords = new ArrayList<>();
|
chunkWords = new ArrayList<>();
|
||||||
|
|
||||||
if (splitByX && !isSplitByRuling) {
|
if (splitByX && !isSplitByRuling) {
|
||||||
@ -86,7 +89,11 @@ public class DocuMineBlockificationService {
|
|||||||
wasSplitted = false;
|
wasSplitted = false;
|
||||||
cb1.setOrientation(Orientation.RIGHT);
|
cb1.setOrientation(Orientation.RIGHT);
|
||||||
splitX1 = null;
|
splitX1 = null;
|
||||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation
|
||||||
|
|| !startFromTop
|
||||||
|
|| !splitByX
|
||||||
|
|| !newLineAfterSplit
|
||||||
|
|| !isSplitByRuling)) {
|
||||||
cb1.setOrientation(Orientation.LEFT);
|
cb1.setOrientation(Orientation.LEFT);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -114,128 +121,12 @@ public class DocuMineBlockificationService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TextPageBlock cb1 = buildTextBlock(chunkWords);
|
textPageBlocks.add(new TextPageBlock(chunkWords));
|
||||||
if (cb1 != null) {
|
|
||||||
chunkBlockList1.add(cb1);
|
|
||||||
}
|
|
||||||
|
|
||||||
return new ClassificationPage(chunkBlockList1);
|
return new ClassificationPage(textPageBlocks);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean equalsWithThreshold(float f1, float f2) {
|
|
||||||
|
|
||||||
return Math.abs(f1 - f2) < THRESHOLD;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList) {
|
|
||||||
|
|
||||||
TextPageBlock textBlock = null;
|
|
||||||
|
|
||||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
|
||||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
|
||||||
|
|
||||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
|
||||||
|
|
||||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
|
||||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
|
||||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
|
||||||
fontFrequencyCounter.add(wordBlock.getFont());
|
|
||||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
|
||||||
|
|
||||||
if (textBlock == null) {
|
|
||||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
|
||||||
wordBlock.getMaxXDirAdj(),
|
|
||||||
wordBlock.getMinYDirAdj(),
|
|
||||||
wordBlock.getMaxYDirAdj(),
|
|
||||||
wordBlockList,
|
|
||||||
wordBlock.getRotation());
|
|
||||||
} else {
|
|
||||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
|
||||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (textBlock != null) {
|
|
||||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
|
||||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
|
||||||
}
|
|
||||||
return textBlock;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean isSplitByRuling(float minX,
|
|
||||||
float minY,
|
|
||||||
float maxX,
|
|
||||||
float maxY,
|
|
||||||
TextPositionSequence word,
|
|
||||||
List<Ruling> horizontalRulingLines,
|
|
||||||
List<Ruling> verticalRulingLines) {
|
|
||||||
|
|
||||||
return isSplitByRuling(maxX,
|
|
||||||
minY,
|
|
||||||
word.getMinXDirAdj(),
|
|
||||||
word.getMinYDirAdj(),
|
|
||||||
verticalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight()) //
|
|
||||||
|| isSplitByRuling(minX,
|
|
||||||
minY,
|
|
||||||
word.getMinXDirAdj(),
|
|
||||||
word.getMaxYDirAdj(),
|
|
||||||
horizontalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight()) //
|
|
||||||
|| isSplitByRuling(maxX,
|
|
||||||
minY,
|
|
||||||
word.getMinXDirAdj(),
|
|
||||||
word.getMinYDirAdj(),
|
|
||||||
horizontalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight()) //
|
|
||||||
|| isSplitByRuling(minX,
|
|
||||||
minY,
|
|
||||||
word.getMinXDirAdj(),
|
|
||||||
word.getMaxYDirAdj(),
|
|
||||||
verticalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight()); //
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
|
||||||
|
|
||||||
for (Ruling ruling : rulingLines) {
|
|
||||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
|
||||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private double round(float value, int decimalPoints) {
|
|
||||||
|
|
||||||
var d = Math.pow(10, decimalPoints);
|
|
||||||
return Math.round(value * d) / d;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -13,14 +13,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.FloatFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
import com.knecon.fforesight.service.layoutparser.processor.model.Orientation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RulingTextDirAdjustUtil;
|
|
||||||
|
|
||||||
@SuppressWarnings("all")
|
@SuppressWarnings("all")
|
||||||
@Service
|
@Service
|
||||||
@ -34,12 +31,13 @@ public class RedactManagerBlockificationService {
|
|||||||
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
* This method must use text direction adjusted postions (DirAdj). Where {0,0} is on the upper left. Never try to change this!
|
||||||
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
* Rulings (TablePageBlock lines) must be adjusted to the text directions as well, when checking if a block is split by a ruling.
|
||||||
*
|
*
|
||||||
* @param textPositions The words of a page.
|
* @param textPositions The words of a page.
|
||||||
|
* @param visualizations
|
||||||
* @return Page object that contains the Textblock and text statistics.
|
* @return Page object that contains the Textblock and text statistics.
|
||||||
*/
|
*/
|
||||||
public ClassificationPage blockify(List<TextPositionSequence> textPositions, List<Cell> cells) {
|
public ClassificationPage blockify(List<TextPositionSequence> textPositions, CleanRulings cleanRulings, LayoutparsingVisualizations visualizations) {
|
||||||
|
|
||||||
CleanRulings usedRulings = RectangleTransformations.extractRulings(cells);
|
CleanRulings usedRulings = cleanRulings.withoutTextRulings();
|
||||||
|
|
||||||
int indexOnPage = 0;
|
int indexOnPage = 0;
|
||||||
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
List<TextPositionSequence> chunkWords = new ArrayList<>();
|
||||||
@ -57,7 +55,7 @@ public class RedactManagerBlockificationService {
|
|||||||
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
boolean splitByX = prev != null && maxX + 50 < word.getMinXDirAdj() && prev.getMinYDirAdj() == word.getMinYDirAdj();
|
||||||
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
boolean xIsBeforeFirstX = prev != null && word.getMinXDirAdj() < minX;
|
||||||
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
boolean newLineAfterSplit = prev != null && word.getMinYDirAdj() != prev.getMinYDirAdj() && wasSplitted && splitX1 != word.getMinXDirAdj();
|
||||||
boolean isSplitByRuling = isSplitByRuling(minX, minY, maxX, maxY, word, usedRulings.getHorizontal(), usedRulings.getVertical());
|
boolean isSplitByRuling = prev != null && usedRulings.lineBetween(prev, word);
|
||||||
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
boolean splitByDir = prev != null && !prev.getDir().equals(word.getDir());
|
||||||
|
|
||||||
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
if (prev != null && (lineSeparation || startFromTop || splitByX || splitByDir || isSplitByRuling)) {
|
||||||
@ -67,7 +65,7 @@ public class RedactManagerBlockificationService {
|
|||||||
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
|
prevOrientation = chunkBlockList.get(chunkBlockList.size() - 1).getOrientation();
|
||||||
}
|
}
|
||||||
|
|
||||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
TextPageBlock cb1 = new TextPageBlock(chunkWords);
|
||||||
indexOnPage++;
|
indexOnPage++;
|
||||||
|
|
||||||
chunkBlockList.add(cb1);
|
chunkBlockList.add(cb1);
|
||||||
@ -81,7 +79,11 @@ public class RedactManagerBlockificationService {
|
|||||||
wasSplitted = false;
|
wasSplitted = false;
|
||||||
cb1.setOrientation(Orientation.RIGHT);
|
cb1.setOrientation(Orientation.RIGHT);
|
||||||
splitX1 = null;
|
splitX1 = null;
|
||||||
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation || !startFromTop || !splitByX || !newLineAfterSplit || !isSplitByRuling)) {
|
} else if (prevOrientation != null && prevOrientation.equals(Orientation.RIGHT) && (lineSeparation
|
||||||
|
|| !startFromTop
|
||||||
|
|| !splitByX
|
||||||
|
|| !newLineAfterSplit
|
||||||
|
|| !isSplitByRuling)) {
|
||||||
cb1.setOrientation(Orientation.LEFT);
|
cb1.setOrientation(Orientation.LEFT);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -109,8 +111,8 @@ public class RedactManagerBlockificationService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TextPageBlock cb1 = buildTextBlock(chunkWords, indexOnPage);
|
if (!chunkWords.isEmpty()) {
|
||||||
if (cb1 != null) {
|
TextPageBlock cb1 = new TextPageBlock(chunkWords);
|
||||||
chunkBlockList.add(cb1);
|
chunkBlockList.add(cb1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -150,8 +152,11 @@ public class RedactManagerBlockificationService {
|
|||||||
TextPageBlock block = (TextPageBlock) itty.next();
|
TextPageBlock block = (TextPageBlock) itty.next();
|
||||||
|
|
||||||
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
if (previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation().equals(Orientation.LEFT) && equalsWithThreshold(block.getMaxY(),
|
||||||
previous.getMaxY()) || previous != null && previous.getOrientation().equals(Orientation.LEFT) && block.getOrientation()
|
previous.getMaxY())
|
||||||
.equals(Orientation.RIGHT) && equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
|| previous != null
|
||||||
|
&& previous.getOrientation().equals(Orientation.LEFT)
|
||||||
|
&& block.getOrientation().equals(Orientation.RIGHT)
|
||||||
|
&& equalsWithThreshold(block.getMaxY(), previous.getMaxY())) {
|
||||||
previous.add(block);
|
previous.add(block);
|
||||||
itty.remove();
|
itty.remove();
|
||||||
continue;
|
continue;
|
||||||
@ -159,123 +164,19 @@ public class RedactManagerBlockificationService {
|
|||||||
|
|
||||||
previous = block;
|
previous = block;
|
||||||
}
|
}
|
||||||
|
if (!textPositions.isEmpty()) {
|
||||||
|
visualizations.addTextBlockVisualizations(chunkBlockList.stream()
|
||||||
|
.map(tb -> (TextPageBlock) tb)
|
||||||
|
.toList(), textPositions.get(0).getPage());
|
||||||
|
}
|
||||||
|
|
||||||
return new ClassificationPage(chunkBlockList);
|
return new ClassificationPage(chunkBlockList);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean equalsWithThreshold(float f1, float f2) {
|
private boolean equalsWithThreshold(double f1, double f2) {
|
||||||
|
|
||||||
return Math.abs(f1 - f2) < THRESHOLD;
|
return Math.abs(f1 - f2) < THRESHOLD;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
|
||||||
|
|
||||||
TextPageBlock textBlock = null;
|
|
||||||
|
|
||||||
FloatFrequencyCounter lineHeightFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
FloatFrequencyCounter fontSizeFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
FloatFrequencyCounter spaceFrequencyCounter = new FloatFrequencyCounter();
|
|
||||||
StringFrequencyCounter fontFrequencyCounter = new StringFrequencyCounter();
|
|
||||||
StringFrequencyCounter styleFrequencyCounter = new StringFrequencyCounter();
|
|
||||||
|
|
||||||
for (TextPositionSequence wordBlock : wordBlockList) {
|
|
||||||
|
|
||||||
lineHeightFrequencyCounter.add(wordBlock.getTextHeight());
|
|
||||||
fontSizeFrequencyCounter.add(wordBlock.getFontSize());
|
|
||||||
spaceFrequencyCounter.add(wordBlock.getSpaceWidth());
|
|
||||||
fontFrequencyCounter.add(wordBlock.getFont());
|
|
||||||
styleFrequencyCounter.add(wordBlock.getFontStyle());
|
|
||||||
|
|
||||||
if (textBlock == null) {
|
|
||||||
textBlock = new TextPageBlock(wordBlock.getMinXDirAdj(),
|
|
||||||
wordBlock.getMaxXDirAdj(),
|
|
||||||
wordBlock.getMinYDirAdj(),
|
|
||||||
wordBlock.getMaxYDirAdj(),
|
|
||||||
wordBlockList,
|
|
||||||
wordBlock.getRotation());
|
|
||||||
} else {
|
|
||||||
TextPageBlock spatialEntity = textBlock.union(wordBlock);
|
|
||||||
textBlock.resize(spatialEntity.getMinX(), spatialEntity.getMinY(), spatialEntity.getWidth(), spatialEntity.getHeight());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (textBlock != null) {
|
|
||||||
textBlock.setMostPopularWordFont(fontFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordStyle(styleFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordFontSize(fontSizeFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordHeight(lineHeightFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setMostPopularWordSpaceWidth(spaceFrequencyCounter.getMostPopular());
|
|
||||||
textBlock.setHighestFontSize(fontSizeFrequencyCounter.getHighest());
|
|
||||||
}
|
|
||||||
|
|
||||||
if (textBlock != null && textBlock.getSequences() != null && textBlock.getSequences().stream().map(t -> round(t.getMinYDirAdj(), 3)).collect(toSet()).size() == 1) {
|
|
||||||
textBlock.getSequences().sort(Comparator.comparing(TextPositionSequence::getMinXDirAdj));
|
|
||||||
}
|
|
||||||
return textBlock;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean isSplitByRuling(float minX,
|
|
||||||
float minY,
|
|
||||||
float maxX,
|
|
||||||
float maxY,
|
|
||||||
TextPositionSequence word,
|
|
||||||
List<Ruling> horizontalRulingLines,
|
|
||||||
List<Ruling> verticalRulingLines) {
|
|
||||||
|
|
||||||
return isSplitByRuling(maxX,
|
|
||||||
minY,
|
|
||||||
word.getMinXDirAdj(),
|
|
||||||
word.getMinYDirAdj(),
|
|
||||||
verticalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight()) //
|
|
||||||
|| isSplitByRuling(minX,
|
|
||||||
minY,
|
|
||||||
word.getMinXDirAdj(),
|
|
||||||
word.getMaxYDirAdj(),
|
|
||||||
horizontalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight()) //
|
|
||||||
|| isSplitByRuling(maxX,
|
|
||||||
minY,
|
|
||||||
word.getMinXDirAdj(),
|
|
||||||
word.getMinYDirAdj(),
|
|
||||||
horizontalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight()) //
|
|
||||||
|| isSplitByRuling(minX,
|
|
||||||
minY,
|
|
||||||
word.getMinXDirAdj(),
|
|
||||||
word.getMaxYDirAdj(),
|
|
||||||
verticalRulingLines,
|
|
||||||
word.getDir().getDegrees(),
|
|
||||||
word.getPageWidth(),
|
|
||||||
word.getPageHeight());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private boolean isSplitByRuling(float previousX2, float previousY1, float currentX1, float currentY1, List<Ruling> rulingLines, float dir, float pageWidth, float pageHeight) {
|
|
||||||
|
|
||||||
for (Ruling ruling : rulingLines) {
|
|
||||||
var line = RulingTextDirAdjustUtil.convertToDirAdj(ruling, dir, pageWidth, pageHeight);
|
|
||||||
if (line.intersectsLine(previousX2, previousY1, currentX1, currentY1)) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private double round(float value, int decimalPoints) {
|
|
||||||
|
|
||||||
var d = Math.pow(10, decimalPoints);
|
|
||||||
return Math.round(value * d) / d;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -14,6 +14,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.HeaderFooterDetection;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PositionUtils;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -25,7 +27,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
public class DocuMineClassificationService {
|
public class DocuMineClassificationService {
|
||||||
|
|
||||||
private final HeadlineClassificationService headlineClassificationService;
|
private final HeadlineClassificationService headlineClassificationService;
|
||||||
private static final Pattern pattern = Pattern.compile("^(\\d{1,2}\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||||
|
|
||||||
@ -72,15 +74,26 @@ public class DocuMineClassificationService {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||||
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
|| (PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
||||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
textBlock,
|
||||||
.getMostPopular())) {
|
page.getRotation()) && (document.getFontSizeCounter().getMostPopular()
|
||||||
|
== null
|
||||||
|
|| textBlock.getHighestFontSize()
|
||||||
|
<= document.getFontSizeCounter()
|
||||||
|
.getMostPopular()))
|
||||||
|
|| HeaderFooterDetection.isLikelyHeader(textBlock, document, page)) {
|
||||||
textBlock.setClassification(PageBlockType.HEADER);
|
textBlock.setClassification(PageBlockType.HEADER);
|
||||||
|
|
||||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||||
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
|| (PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
||||||
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
textBlock,
|
||||||
.getMostPopular())) {
|
page.getRotation())
|
||||||
|
&& (document.getFontSizeCounter().getMostPopular()
|
||||||
|
== null
|
||||||
|
|| textBlock.getHighestFontSize()
|
||||||
|
<= document.getFontSizeCounter()
|
||||||
|
.getMostPopular()))
|
||||||
|
|| HeaderFooterDetection.isLikelyFooter(textBlock, document, page)) {
|
||||||
textBlock.setClassification(PageBlockType.FOOTER);
|
textBlock.setClassification(PageBlockType.FOOTER);
|
||||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||||
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||||
@ -92,19 +105,19 @@ public class DocuMineClassificationService {
|
|||||||
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
|
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||||
|
|
||||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && !matcher2.matches() && !textBlock.toString()
|
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString()
|
||||||
.contains(":")
|
.contains(":")
|
||||||
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && !matcher2.matches() && !textBlock.toString().contains(":")
|
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":")
|
||||||
|| textBlock.toString().startsWith("APPENDIX")
|
|| textBlock.toString().startsWith("APPENDIX")
|
||||||
|| textBlock.toString().startsWith("FIGURE")
|
|| textBlock.toString().startsWith("FIGURE")
|
||||||
|| textBlock.toString().startsWith("TABLE"))
|
|| textBlock.toString().startsWith("TABLE"))
|
||||||
&& !textBlock.toString().endsWith(":")
|
&& !textBlock.toString().endsWith(":")
|
||||||
&& matcher2.find()) {
|
&& matcher2.reset().find()) {
|
||||||
PageBlockType headlineType = PageBlockType.getHeadlineType(1);
|
PageBlockType headlineType = PageBlockType.getHeadlineType(1);
|
||||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
|
|
||||||
} else if (matcher.find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.find() && !matcher3.matches()) {
|
} else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) {
|
||||||
PageBlockType headlineType = PageBlockType.getHeadlineType(2);
|
PageBlockType headlineType = PageBlockType.getHeadlineType(2);
|
||||||
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
|
|||||||
@ -52,6 +52,9 @@ public class DocumentGraphFactory {
|
|||||||
public Document buildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument document) {
|
public Document buildDocumentGraph(LayoutParsingType layoutParsingType, ClassificationDocument document) {
|
||||||
|
|
||||||
Document documentGraph = new Document();
|
Document documentGraph = new Document();
|
||||||
|
|
||||||
|
documentGraph.setVisualizations(document.getVisualizations());
|
||||||
|
|
||||||
Context context = new Context(documentGraph);
|
Context context = new Context(documentGraph);
|
||||||
|
|
||||||
document.getPages()
|
document.getPages()
|
||||||
@ -88,14 +91,18 @@ public class DocumentGraphFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void addParagraphOrHeadline(GenericSemanticNode parentNode, TextPageBlock originalTextBlock, Context context, List<TextPageBlock> textBlocksToMerge) {
|
public void addParagraphOrHeadline(GenericSemanticNode parentNode,
|
||||||
|
TextPageBlock originalTextBlock,
|
||||||
|
Context context,
|
||||||
|
List<TextPageBlock> textBlocksToMerge,
|
||||||
|
LayoutParsingType layoutParsingType) {
|
||||||
|
|
||||||
Page page = context.getPage(originalTextBlock.getPage());
|
Page page = context.getPage(originalTextBlock.getPage());
|
||||||
|
|
||||||
GenericSemanticNode node;
|
GenericSemanticNode node;
|
||||||
if (originalTextBlock.isHeadline()) {
|
if (originalTextBlock.isHeadline()) {
|
||||||
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
node = Headline.builder().documentTree(context.getDocumentTree()).build();
|
||||||
} else if (originalTextBlock.isToDuplicate()) {
|
} else if (originalTextBlock.isToDuplicate() && layoutParsingType.equals(LayoutParsingType.REDACT_MANAGER)) {
|
||||||
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
|
node = DuplicatedParagraph.builder().documentTree(context.getDocumentTree()).build();
|
||||||
} else {
|
} else {
|
||||||
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
node = Paragraph.builder().documentTree(context.getDocumentTree()).build();
|
||||||
@ -274,8 +281,7 @@ public class DocumentGraphFactory {
|
|||||||
return pages.keySet()
|
return pages.keySet()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(page -> page.getNumber() == pageIndex)
|
.filter(page -> page.getNumber() == pageIndex)
|
||||||
.findFirst()
|
.findFirst().orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
|
||||||
.orElseThrow(() -> new NoSuchElementException(format("ClassificationPage with number %d not found", pageIndex)));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor.services.factory;
|
|||||||
|
|
||||||
import java.awt.geom.AffineTransform;
|
import java.awt.geom.AffineTransform;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -29,19 +30,22 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
|
|
||||||
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<TextPositionSequence> sequences) {
|
public SearchTextWithTextPositionDto buildSearchTextToTextPositionDto(List<TextPositionSequence> sequences) {
|
||||||
|
|
||||||
if (sequences.isEmpty() || sequences.stream().allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
|
if (sequences.isEmpty() || sequences.stream()
|
||||||
|
.allMatch(sequence -> sequence.getTextPositions().isEmpty())) {
|
||||||
return SearchTextWithTextPositionDto.empty();
|
return SearchTextWithTextPositionDto.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
Context context = new Context();
|
Context context = new Context();
|
||||||
|
|
||||||
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions().get(0);
|
RedTextPosition currentTextPosition = sequences.get(0).getTextPositions()
|
||||||
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").position(currentTextPosition.getPosition()).build();
|
.get(0);
|
||||||
|
RedTextPosition previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(currentTextPosition.getBBoxDirAdj()).build();
|
||||||
|
|
||||||
for (TextPositionSequence word : sequences) {
|
for (TextPositionSequence word : sequences) {
|
||||||
for (int i = 0; i < word.getTextPositions().size(); ++i) {
|
for (int i = 0; i < word.getTextPositions().size(); ++i) {
|
||||||
|
|
||||||
currentTextPosition = word.getTextPositions().get(i);
|
currentTextPosition = word.getTextPositions()
|
||||||
|
.get(i);
|
||||||
if (isLineBreak(currentTextPosition, previousTextPosition)) {
|
if (isLineBreak(currentTextPosition, previousTextPosition)) {
|
||||||
removeHyphenLinebreaks(context);
|
removeHyphenLinebreaks(context);
|
||||||
context.lineBreaksStringIdx.add(context.stringIdx);
|
context.lineBreaksStringIdx.add(context.stringIdx);
|
||||||
@ -57,18 +61,21 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
++context.positionIdx;
|
++context.positionIdx;
|
||||||
}
|
}
|
||||||
|
|
||||||
previousTextPosition = RedTextPosition.builder().unicode(" ").position(previousTextPosition.getPosition()).build();
|
previousTextPosition = RedTextPosition.builder().unicode(" ").bBoxDirAdj(previousTextPosition.getBBoxDirAdj()).build();
|
||||||
context.stringBuilder.append(" ");
|
context.stringBuilder.append(" ");
|
||||||
context.stringIdxToPositionIdx.add(context.positionIdx);
|
context.stringIdxToPositionIdx.add(context.positionIdx);
|
||||||
++context.stringIdx;
|
++context.stringIdx;
|
||||||
}
|
}
|
||||||
|
|
||||||
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
|
|
||||||
|
|
||||||
List<Rectangle2D> positions = sequences.stream()
|
List<Rectangle2D> positions = sequences.stream()
|
||||||
.flatMap(sequence -> sequence.getTextPositions().stream().map(textPosition -> mapRedTextPositionToInitialUserSpace(textPosition, sequence)))
|
.map(TextPositionSequence::getTextPositions)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.map(RedTextPosition::getBBoxInitialUserSpace)
|
||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
|
assert context.stringBuilder.length() == context.stringIdxToPositionIdx.size();
|
||||||
|
|
||||||
return SearchTextWithTextPositionDto.builder()
|
return SearchTextWithTextPositionDto.builder()
|
||||||
.searchText(context.stringBuilder.toString())
|
.searchText(context.stringBuilder.toString())
|
||||||
.lineBreaks(context.lineBreaksStringIdx)
|
.lineBreaks(context.lineBreaksStringIdx)
|
||||||
@ -153,7 +160,7 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
float deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
|
double deltaY = Math.abs(currentPosition.getYDirAdj() - previousPosition.getYDirAdj());
|
||||||
return deltaY >= currentPosition.getHeightDir();
|
return deltaY >= currentPosition.getHeightDir();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -167,16 +174,16 @@ public class SearchTextWithTextPositionFactory {
|
|||||||
private boolean isHyphen(String unicodeCharacter) {
|
private boolean isHyphen(String unicodeCharacter) {
|
||||||
|
|
||||||
return Objects.equals(unicodeCharacter, "-") || //
|
return Objects.equals(unicodeCharacter, "-") || //
|
||||||
Objects.equals(unicodeCharacter, "~") || //
|
Objects.equals(unicodeCharacter, "~") || //
|
||||||
Objects.equals(unicodeCharacter, "‐") || //
|
Objects.equals(unicodeCharacter, "‐") || //
|
||||||
Objects.equals(unicodeCharacter, "‒") || //
|
Objects.equals(unicodeCharacter, "‒") || //
|
||||||
Objects.equals(unicodeCharacter, "⁻") || //
|
Objects.equals(unicodeCharacter, "⁻") || //
|
||||||
Objects.equals(unicodeCharacter, "−") || //
|
Objects.equals(unicodeCharacter, "−") || //
|
||||||
Objects.equals(unicodeCharacter, "﹣") || //
|
Objects.equals(unicodeCharacter, "﹣") || //
|
||||||
Objects.equals(unicodeCharacter, "゠") || //
|
Objects.equals(unicodeCharacter, "゠") || //
|
||||||
Objects.equals(unicodeCharacter, "⁓") || //
|
Objects.equals(unicodeCharacter, "⁓") || //
|
||||||
Objects.equals(unicodeCharacter, "‑") || //
|
Objects.equals(unicodeCharacter, "‑") || //
|
||||||
Objects.equals(unicodeCharacter, "\u00AD");
|
Objects.equals(unicodeCharacter, "\u00AD");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -140,15 +140,15 @@ public class SectionNodeFactory {
|
|||||||
if (abstractPageBlock instanceof TextPageBlock) {
|
if (abstractPageBlock instanceof TextPageBlock) {
|
||||||
|
|
||||||
switch (layoutParsingType) {
|
switch (layoutParsingType) {
|
||||||
case REDACT_MANAGER, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
|
case REDACT_MANAGER, DOCUMINE, CLARIFYND, CLARIFYND_PARAGRAPH_DEBUG -> {
|
||||||
alreadyMerged.add(abstractPageBlock);
|
alreadyMerged.add(abstractPageBlock);
|
||||||
remainingBlocks.remove(abstractPageBlock);
|
remainingBlocks.remove(abstractPageBlock);
|
||||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>());
|
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, new ArrayList<>(), layoutParsingType);
|
||||||
}
|
}
|
||||||
default -> {
|
default -> {
|
||||||
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
|
List<TextPageBlock> textBlocks = findTextBlocksWithSameClassificationAndAlignsY((TextPageBlock) abstractPageBlock, remainingBlocks);
|
||||||
alreadyMerged.addAll(textBlocks);
|
alreadyMerged.addAll(textBlocks);
|
||||||
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks);
|
DocumentGraphFactory.addParagraphOrHeadline(section, (TextPageBlock) abstractPageBlock, context, textBlocks, layoutParsingType);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
} else if (abstractPageBlock instanceof TablePageBlock tablePageBlock) {
|
||||||
|
|||||||
@ -45,7 +45,10 @@ public class TableNodeFactory {
|
|||||||
.flatMap(Collection::stream)
|
.flatMap(Collection::stream)
|
||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
Table table = Table.builder().documentTree(context.getDocumentTree()).numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size()).numberOfRows(mergedRows.size())
|
Table table = Table.builder()
|
||||||
|
.documentTree(context.getDocumentTree())
|
||||||
|
.numberOfCols(mergedRows.isEmpty() ? 0 : mergedRows.get(0).size())
|
||||||
|
.numberOfRows(mergedRows.size())
|
||||||
.build();
|
.build();
|
||||||
|
|
||||||
pages.forEach(page -> addTableToPage(page, parentNode, table));
|
pages.forEach(page -> addTableToPage(page, parentNode, table));
|
||||||
@ -128,7 +131,12 @@ public class TableNodeFactory {
|
|||||||
|
|
||||||
Page page = context.getPage(cell.getPageNumber());
|
Page page = context.getPage(cell.getPageNumber());
|
||||||
|
|
||||||
TableCell tableCell = TableCell.builder().documentTree(context.getDocumentTree()).row(rowIndex).col(colIndex).header(cell.isHeaderCell()).bBox(cell.getBounds2D())
|
TableCell tableCell = TableCell.builder()
|
||||||
|
.documentTree(context.getDocumentTree())
|
||||||
|
.row(rowIndex)
|
||||||
|
.col(colIndex)
|
||||||
|
.header(cell.isHeaderCell())
|
||||||
|
.bBox(cell.getBBoxInitialUserSpace())
|
||||||
.build();
|
.build();
|
||||||
page.getMainBody().add(tableCell);
|
page.getMainBody().add(tableCell);
|
||||||
|
|
||||||
@ -160,7 +168,7 @@ public class TableNodeFactory {
|
|||||||
tableCell.setLeafTextBlock(textBlock);
|
tableCell.setLeafTextBlock(textBlock);
|
||||||
} else {
|
} else {
|
||||||
cell.getTextBlocks()
|
cell.getTextBlocks()
|
||||||
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList()));
|
.forEach(tb -> DocumentGraphFactory.addParagraphOrHeadline(tableCell, tb, context, emptyList(), layoutParsingType));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -13,6 +13,9 @@ import org.apache.pdfbox.rendering.ImageType;
|
|||||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.CoordinateTransforms;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
@Service
|
@Service
|
||||||
@ -30,7 +33,7 @@ public class FindGraphicsRaster {
|
|||||||
|
|
||||||
var renderer = new PDFRenderer(doc);
|
var renderer = new PDFRenderer(doc);
|
||||||
var img = renderer.renderImageWithDPI(pageInformation.number() - 1, DPI, ImageType.GRAY);
|
var img = renderer.renderImageWithDPI(pageInformation.number() - 1, DPI, ImageType.GRAY);
|
||||||
var imageCtm = getImageCTM(pageInformation, img.getWidth());
|
var imageCtm = CoordinateTransforms.calculateImageCoordsToInitialUserSpaceCoords(pageInformation, CoordinateTransforms.calculateScalingFactor(pageInformation, img.getWidth()));
|
||||||
return findCCBoundingBoxes(img, remove, THRESHOLD, DPI / 72, imageCtm);
|
return findCCBoundingBoxes(img, remove, THRESHOLD, DPI / 72, imageCtm);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -131,42 +134,4 @@ public class FindGraphicsRaster {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public AffineTransform getImageCTM(PageInformation pageInformation, int imageWidth) {
|
|
||||||
|
|
||||||
double scalingFactor = calculateScalingFactor(pageInformation, imageWidth);
|
|
||||||
AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, -pageInformation.minX(), -pageInformation.minY());
|
|
||||||
|
|
||||||
AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());
|
|
||||||
|
|
||||||
AffineTransform rotationMatrix = switch (pageInformation.rotationDegrees()) {
|
|
||||||
case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
|
|
||||||
case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
|
|
||||||
case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height()); // results from 90 + 180 rotations
|
|
||||||
default -> new AffineTransform();
|
|
||||||
};
|
|
||||||
|
|
||||||
// matrix multiplication is performed from right to left, so the order is reversed.
|
|
||||||
// scaling -> mirror -> rotation
|
|
||||||
AffineTransform resultMatrix = new AffineTransform();
|
|
||||||
|
|
||||||
resultMatrix.concatenate(rotationMatrix);
|
|
||||||
resultMatrix.concatenate(mirrorMatrix);
|
|
||||||
resultMatrix.concatenate(imageToCropBoxScaling);
|
|
||||||
return resultMatrix;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private double calculateScalingFactor(PageInformation pageInformation, int imageWidth) {
|
|
||||||
|
|
||||||
// PDFBox always returns page height and width based on rotation
|
|
||||||
double pageWidth;
|
|
||||||
if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
|
|
||||||
pageWidth = pageInformation.height();
|
|
||||||
} else {
|
|
||||||
pageWidth = pageInformation.width();
|
|
||||||
}
|
|
||||||
|
|
||||||
return pageWidth / imageWidth;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
|
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
@ -9,10 +8,11 @@ import org.apache.pdfbox.pdmodel.PDDocument;
|
|||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.PageInformation;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
@ -22,6 +22,9 @@ import lombok.SneakyThrows;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class GraphicExtractorService {
|
public class GraphicExtractorService {
|
||||||
|
|
||||||
|
private static final int MIN_GRAPHICS_SIDE_LENGTH = 30;
|
||||||
|
private static final int MIN_GRAPHICS_AREA = 500;
|
||||||
|
|
||||||
private final GraphicsClusteringService graphicsClusteringService;
|
private final GraphicsClusteringService graphicsClusteringService;
|
||||||
private final FindGraphicsRaster findGraphicsRaster;
|
private final FindGraphicsRaster findGraphicsRaster;
|
||||||
|
|
||||||
@ -32,33 +35,32 @@ public class GraphicExtractorService {
|
|||||||
int pageNumber,
|
int pageNumber,
|
||||||
CleanRulings cleanRulings,
|
CleanRulings cleanRulings,
|
||||||
List<TextPositionSequence> textPositionSequences,
|
List<TextPositionSequence> textPositionSequences,
|
||||||
List<Cell> emptyTableCells,
|
|
||||||
boolean graphicsRaster) {
|
boolean graphicsRaster) {
|
||||||
|
|
||||||
var characterBBoxes = getCharacterBBoxes(textPositionSequences);
|
List<Box> characterBBoxes = getCharacterBBoxes(textPositionSequences);
|
||||||
var tableLineBBoxes = getLineBBoxesFromTableCells(emptyTableCells);
|
List<Box> classifiedRulingsBoxes = getLineBBoxesOfAllClassifiedRulings(cleanRulings);
|
||||||
var underLineBBoxes = getUnderlineBBoxes(cleanRulings, characterBBoxes);
|
|
||||||
var strikeThroughBBoxes = getStrikeThroughBBoxes(cleanRulings, characterBBoxes);
|
|
||||||
|
|
||||||
GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true);
|
GraphicBBDetector graphicBBDetector = new GraphicBBDetector(pdPage, true);
|
||||||
var graphicBBoxes = graphicBBDetector.findGraphicBB();
|
List<Box> graphicBBoxes = graphicBBDetector.findGraphicBB();
|
||||||
|
|
||||||
if (graphicsRaster) {
|
if (graphicsRaster) {
|
||||||
// This should only be used if ocr was performed, it is currently in an early stage and needs to be improved.
|
// This should only be used if ocr was performed, it is currently in an early stage and needs to be improved.
|
||||||
graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument,
|
graphicBBoxes.addAll(findGraphicsRaster.findCCBoundingBoxes(pdDocument,
|
||||||
characterBBoxes.stream().map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4)).collect(Collectors.toList()),
|
characterBBoxes.stream()
|
||||||
PageInformation.fromPDPage(pageNumber, pdPage)));
|
.map(box -> new Rectangle2D.Double(box.x1 - 2, box.y1 - 2, box.width() + 4, box.height() + 4))
|
||||||
|
.collect(Collectors.toList()),
|
||||||
|
PageInformation.fromPDPage(pageNumber, pdPage)));
|
||||||
}
|
}
|
||||||
|
|
||||||
var filteredGraphicBBoxes = graphicBBoxes.stream()
|
List<Box> filteredGraphicBBoxes = graphicBBoxes.stream()
|
||||||
.filter(box -> !box.intersectsAny(tableLineBBoxes, 4))
|
.filter(box -> !box.intersectsAny(classifiedRulingsBoxes, 4))
|
||||||
.filter(box -> !box.intersectsAny(underLineBBoxes, 4))
|
|
||||||
.filter(box -> !box.intersectsAny(strikeThroughBBoxes, 4))
|
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
var clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14);
|
List<Box> clusters = graphicsClusteringService.getClusters(filteredGraphicBBoxes, 14);
|
||||||
|
|
||||||
return clusters.stream().filter(box -> box.area() > 500 && box.height() > 50 && box.width() > 50).toList();
|
return clusters.stream()
|
||||||
|
.filter(box -> box.area() > MIN_GRAPHICS_AREA && box.height() > MIN_GRAPHICS_SIDE_LENGTH && box.width() > MIN_GRAPHICS_SIDE_LENGTH)
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -74,34 +76,13 @@ public class GraphicExtractorService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<Box> getLineBBoxesFromTableCells(List<Cell> emptyTableCells) {
|
private List<Box> getLineBBoxesOfAllClassifiedRulings(CleanRulings cleanRulings) {
|
||||||
|
|
||||||
List<Box> expandedTableLines = new ArrayList<>();
|
return cleanRulings.buildAll()
|
||||||
|
|
||||||
emptyTableCells.forEach(cell -> {
|
|
||||||
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y - 1, cell.width, 2)));
|
|
||||||
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x, cell.y + cell.height - 1, cell.width, 2)));
|
|
||||||
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x - 1, cell.y, 2, cell.height)));
|
|
||||||
expandedTableLines.add(new Box(new Rectangle2D.Double(cell.x + cell.width - 1, cell.y, 2, cell.height)));
|
|
||||||
});
|
|
||||||
|
|
||||||
return expandedTableLines;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private List<Box> getUnderlineBBoxes(CleanRulings cleanRulings, List<Box> characterBBoxes) {
|
|
||||||
|
|
||||||
return cleanRulings.getHorizontal()
|
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(ruling -> !ruling.getClassification().equals(Ruling.Classification.OTHER))
|
||||||
.map(h -> new Box(h.x1, h.y1, h.x2, h.y2))
|
.map(h -> new Box(h.x1, h.y1, h.x2, h.y2))
|
||||||
.filter(box -> box.intersectsAnyAndOver(characterBBoxes, 6))
|
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<Box> getStrikeThroughBBoxes(CleanRulings cleanRulings, List<Box> characterBBoxes) {
|
|
||||||
|
|
||||||
return cleanRulings.getHorizontal().stream().map(h -> new Box(h.x1, h.y1, h.x2, h.y2)).filter(box -> box.intersectsCenter(characterBBoxes, 2)).collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -82,7 +82,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
|||||||
|
|
||||||
private int pageRotation;
|
private int pageRotation;
|
||||||
private PDRectangle pageSize;
|
private PDRectangle pageSize;
|
||||||
private Matrix translateMatrix;
|
|
||||||
private final GlyphList glyphList;
|
private final GlyphList glyphList;
|
||||||
private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>();
|
private final Map<COSDictionary, Float> fontHeightMap = new WeakHashMap<COSDictionary, Float>();
|
||||||
|
|
||||||
@ -134,12 +133,6 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
|||||||
this.pageRotation = page.getRotation();
|
this.pageRotation = page.getRotation();
|
||||||
this.pageSize = page.getCropBox();
|
this.pageSize = page.getCropBox();
|
||||||
|
|
||||||
if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0) {
|
|
||||||
translateMatrix = null;
|
|
||||||
} else {
|
|
||||||
// translation matrix for cropbox
|
|
||||||
translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), -pageSize.getLowerLeftY());
|
|
||||||
}
|
|
||||||
super.processPage(page);
|
super.processPage(page);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -265,62 +258,52 @@ public class LegacyPDFStreamEngine extends PDFStreamEngine {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// adjust for cropbox if needed
|
|
||||||
Matrix translatedTextRenderingMatrix;
|
|
||||||
if (translateMatrix == null) {
|
|
||||||
translatedTextRenderingMatrix = textRenderingMatrix;
|
|
||||||
} else {
|
|
||||||
translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, textRenderingMatrix);
|
|
||||||
nextX -= pageSize.getLowerLeftX();
|
|
||||||
nextY -= pageSize.getLowerLeftY();
|
|
||||||
}
|
|
||||||
|
|
||||||
// This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf
|
// This is a hack for unicode letter with 2 chars e.g. RA see unicodeProblem.pdf
|
||||||
if (unicodeMapping.length() == 2) {
|
if (unicodeMapping.length() == 2) {
|
||||||
processTextPosition(new TextPosition(pageRotation,
|
processTextPosition(new TextPosition(pageRotation,
|
||||||
pageSize.getWidth(),
|
pageSize.getWidth(),
|
||||||
pageSize.getHeight(),
|
pageSize.getHeight(),
|
||||||
translatedTextRenderingMatrix,
|
textRenderingMatrix,
|
||||||
nextX,
|
nextX,
|
||||||
nextY,
|
nextY,
|
||||||
Math.abs(dyDisplay),
|
Math.abs(dyDisplay),
|
||||||
dxDisplay,
|
dxDisplay,
|
||||||
Math.abs(spaceWidthDisplay),
|
Math.abs(spaceWidthDisplay),
|
||||||
Character.toString(unicodeMapping.charAt(0)),
|
Character.toString(unicodeMapping.charAt(0)),
|
||||||
new int[]{code},
|
new int[]{code},
|
||||||
font,
|
font,
|
||||||
fontSize,
|
fontSize,
|
||||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||||
processTextPosition(new TextPosition(pageRotation,
|
processTextPosition(new TextPosition(pageRotation,
|
||||||
pageSize.getWidth(),
|
pageSize.getWidth(),
|
||||||
pageSize.getHeight(),
|
pageSize.getHeight(),
|
||||||
translatedTextRenderingMatrix,
|
textRenderingMatrix,
|
||||||
nextX,
|
nextX,
|
||||||
nextY,
|
nextY,
|
||||||
Math.abs(dyDisplay),
|
Math.abs(dyDisplay),
|
||||||
dxDisplay,
|
dxDisplay,
|
||||||
Math.abs(spaceWidthDisplay),
|
Math.abs(spaceWidthDisplay),
|
||||||
Character.toString(unicodeMapping.charAt(1)),
|
Character.toString(unicodeMapping.charAt(1)),
|
||||||
new int[]{code},
|
new int[]{code},
|
||||||
font,
|
font,
|
||||||
fontSize,
|
fontSize,
|
||||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
processTextPosition(new TextPosition(pageRotation,
|
processTextPosition(new TextPosition(pageRotation,
|
||||||
pageSize.getWidth(),
|
pageSize.getWidth(),
|
||||||
pageSize.getHeight(),
|
pageSize.getHeight(),
|
||||||
translatedTextRenderingMatrix,
|
textRenderingMatrix,
|
||||||
nextX,
|
nextX,
|
||||||
nextY,
|
nextY,
|
||||||
Math.abs(dyDisplay),
|
Math.abs(dyDisplay),
|
||||||
dxDisplay,
|
dxDisplay,
|
||||||
Math.abs(spaceWidthDisplay),
|
Math.abs(spaceWidthDisplay),
|
||||||
unicodeMapping,
|
unicodeMapping,
|
||||||
new int[]{code},
|
new int[]{code},
|
||||||
font,
|
font,
|
||||||
fontSize,
|
fontSize,
|
||||||
(int) (fontSize * textMatrix.getScalingFactorX())));
|
(int) (fontSize * textMatrix.getScalingFactorX())));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1007,7 +1007,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Set the desired word separator for output text. The PDFBox text extraction algorithm will output a space
|
* Set the desired word separator for output text. The PDFBox text extraction algorithm will output a space
|
||||||
* character if there is enough space between two words. By default a space character is used. If you need and
|
* character if there is enough space between two textPositions. By default a space character is used. If you need and
|
||||||
* accurate count of characters that are found in a PDF document then you might want to set the word separator to
|
* accurate count of characters that are found in a PDF document then you might want to set the word separator to
|
||||||
* the empty string.
|
* the empty string.
|
||||||
*
|
*
|
||||||
@ -1703,7 +1703,7 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
/**
|
/**
|
||||||
* Write a list of string containing a whole line of a document.
|
* Write a list of string containing a whole line of a document.
|
||||||
*
|
*
|
||||||
* @param line a list with the words of the given line
|
* @param line a list with the textPositions of the given line
|
||||||
* @throws IOException if something went wrong
|
* @throws IOException if something went wrong
|
||||||
*/
|
*/
|
||||||
private void writeLine(List<WordWithTextPositions> line, boolean isParagraphEnd) throws IOException {
|
private void writeLine(List<WordWithTextPositions> line, boolean isParagraphEnd) throws IOException {
|
||||||
@ -1744,9 +1744,9 @@ public class PDFTextStripper extends LegacyPDFStreamEngine {
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Handles the LTR and RTL direction of the given words. The whole implementation stands and falls with the given
|
* Handles the LTR and RTL direction of the given textPositions. The whole implementation stands and falls with the given
|
||||||
* word. If the word is a full line, the results will be the best. If the word contains of single words or
|
* word. If the word is a full line, the results will be the best. If the word contains of single textPositions or
|
||||||
* characters, the order of the characters in a word or words in a line may wrong, due to RTL and LTR marks and
|
* characters, the order of the characters in a word or textPositions in a line may wrong, due to RTL and LTR marks and
|
||||||
* characters!
|
* characters!
|
||||||
* <p>
|
* <p>
|
||||||
* Based on http://www.nesterovsky-bros.com/weblog/2013/07/28/VisualToLogicalConversionInJava.aspx
|
* Based on http://www.nesterovsky-bros.com/weblog/2013/07/28/VisualToLogicalConversionInJava.aspx
|
||||||
|
|||||||
@ -65,12 +65,20 @@ public class LayoutGridService {
|
|||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
|
@Observed(name = "ViewerDocumentService", contextualName = "create-viewer-document")
|
||||||
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue) {
|
public void addLayoutGrid(File originFile, Document document, File destinationFile, boolean layerVisibilityDefaultValue, boolean writeVisualLayoutParsingGrid) {
|
||||||
|
|
||||||
|
List<Visualizations> allVisualizations;
|
||||||
Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false);
|
Visualizations layoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, false);
|
||||||
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
|
if (writeVisualLayoutParsingGrid) {
|
||||||
|
Visualizations visualLayoutGrid = this.addLayoutGrid(document, layerVisibilityDefaultValue, true);
|
||||||
|
allVisualizations = Stream.concat(Stream.of(layoutGrid, visualLayoutGrid), document.getVisualizations().streamAll())
|
||||||
|
.toList();
|
||||||
|
} else {
|
||||||
|
allVisualizations = Stream.concat(Stream.of(layoutGrid), document.getVisualizations().streamAll())
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, List.of(layoutGrid, visualLayoutGrid));
|
viewerDocumentService.addVisualizationsOnPage(originFile, destinationFile, allVisualizations);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -130,7 +138,10 @@ public class LayoutGridService {
|
|||||||
}
|
}
|
||||||
for (Page page : table.getPages()) {
|
for (Page page : table.getPages()) {
|
||||||
|
|
||||||
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0).filter(tableCell -> tableCell.isOnPage(page.getNumber())).map(TableCell::getRow).findFirst();
|
Optional<Integer> optionalFirstRowOnPage = table.streamCol(0)
|
||||||
|
.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
|
||||||
|
.map(TableCell::getRow)
|
||||||
|
.findFirst();
|
||||||
if (optionalFirstRowOnPage.isEmpty()) {
|
if (optionalFirstRowOnPage.isEmpty()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -170,14 +181,17 @@ public class LayoutGridService {
|
|||||||
|
|
||||||
private static Stream<Rectangle2D> streamBBoxOfCellsOnPage(Stream<TableCell> table, Page page) {
|
private static Stream<Rectangle2D> streamBBoxOfCellsOnPage(Stream<TableCell> table, Page page) {
|
||||||
|
|
||||||
return table.filter(tableCell -> tableCell.isOnPage(page.getNumber())).map(TableCell::getBBox).map(bBoxMap -> bBoxMap.get(page));
|
return table.filter(tableCell -> tableCell.isOnPage(page.getNumber()))
|
||||||
|
.map(TableCell::getBBox)
|
||||||
|
.map(bBoxMap -> bBoxMap.get(page));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void addSection(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) {
|
private void addSection(SemanticNode semanticNode, LayoutGrid layoutGrid, Color color) {
|
||||||
|
|
||||||
Map<Page, Rectangle2D> bBoxMap = semanticNode.getBBox();
|
Map<Page, Rectangle2D> bBoxMap = semanticNode.getBBox();
|
||||||
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION).toList();
|
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION)
|
||||||
|
.toList();
|
||||||
Page firstPage = semanticNode.getFirstPage();
|
Page firstPage = semanticNode.getFirstPage();
|
||||||
String treeIdString = buildTreeIdString(semanticNode);
|
String treeIdString = buildTreeIdString(semanticNode);
|
||||||
if (!subSections.isEmpty()) {
|
if (!subSections.isEmpty()) {
|
||||||
@ -197,7 +211,10 @@ public class LayoutGridService {
|
|||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
List<Page> pagesInOrder = bBoxMap.keySet().stream().sorted(Comparator.comparingInt(Page::getNumber)).collect(Collectors.toList());
|
List<Page> pagesInOrder = bBoxMap.keySet()
|
||||||
|
.stream()
|
||||||
|
.sorted(Comparator.comparingInt(Page::getNumber))
|
||||||
|
.collect(Collectors.toList());
|
||||||
pagesInOrder.remove(0);
|
pagesInOrder.remove(0);
|
||||||
addLinesForFirstPageOfSection(semanticNode, color, firstPage, layoutGrid);
|
addLinesForFirstPageOfSection(semanticNode, color, firstPage, layoutGrid);
|
||||||
var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1);
|
var lastPage = pagesInOrder.remove(pagesInOrder.size() - 1);
|
||||||
@ -294,7 +311,10 @@ public class LayoutGridService {
|
|||||||
|
|
||||||
private String buildTreeIdString(SemanticNode semanticNode) {
|
private String buildTreeIdString(SemanticNode semanticNode) {
|
||||||
|
|
||||||
return semanticNode.getTreeId().stream().map(Object::toString).collect(Collectors.joining("."));
|
return semanticNode.getTreeId()
|
||||||
|
.stream()
|
||||||
|
.map(Object::toString)
|
||||||
|
.collect(Collectors.joining("."));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,56 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
|
import java.awt.geom.AffineTransform;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
public class CoordinateTransforms {
|
||||||
|
|
||||||
|
public AffineTransform calculateImageCoordsToInitialUserSpaceCoords(PageInformation pageInformation, double scalingFactor) {
|
||||||
|
|
||||||
|
AffineTransform imageToCropBoxScaling = new AffineTransform(scalingFactor, 0, 0, scalingFactor, -pageInformation.minX(), -pageInformation.minY());
|
||||||
|
|
||||||
|
AffineTransform mirrorMatrix = new AffineTransform(1, 0, 0, -1, 0, pageInformation.height());
|
||||||
|
|
||||||
|
AffineTransform rotationMatrix = switch (pageInformation.rotationDegrees()) {
|
||||||
|
case 90 -> new AffineTransform(0, 1, -1, 0, pageInformation.height(), 0);
|
||||||
|
case 180 -> new AffineTransform(-1, 0, 0, -1, pageInformation.width(), pageInformation.height());
|
||||||
|
case 270 -> new AffineTransform(0, -1, 1, 0, pageInformation.width() - pageInformation.height(), pageInformation.height()); // results from 90 + 180 rotations
|
||||||
|
default -> new AffineTransform();
|
||||||
|
};
|
||||||
|
|
||||||
|
// matrix multiplication is performed from right to left, so the order is reversed.
|
||||||
|
// scaling -> mirror -> rotation
|
||||||
|
AffineTransform resultMatrix = new AffineTransform();
|
||||||
|
|
||||||
|
resultMatrix.concatenate(rotationMatrix);
|
||||||
|
resultMatrix.concatenate(mirrorMatrix);
|
||||||
|
resultMatrix.concatenate(imageToCropBoxScaling);
|
||||||
|
return resultMatrix;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public AffineTransform calculateInitialUserSpaceCoordsToImageCoords(PageInformation pageInformation, double scalingFactor) {
|
||||||
|
|
||||||
|
return calculateImageCoordsToInitialUserSpaceCoords(pageInformation, scalingFactor).createInverse();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double calculateScalingFactor(PageInformation pageInformation, double imageWidth) {
|
||||||
|
|
||||||
|
// PDFBox always returns page height and width based on rotation
|
||||||
|
double pageWidth;
|
||||||
|
if (pageInformation.rotationDegrees() == 90 || pageInformation.rotationDegrees() == 270) {
|
||||||
|
pageWidth = pageInformation.height();
|
||||||
|
} else {
|
||||||
|
pageWidth = pageInformation.width();
|
||||||
|
}
|
||||||
|
|
||||||
|
return pageWidth / imageWidth;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,10 +1,10 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
|
||||||
public class GeometricComparators {
|
public class GeometricComparators {
|
||||||
@ -58,7 +58,7 @@ public class GeometricComparators {
|
|||||||
return cell1Size.compareTo(cell2Size);
|
return cell1Size.compareTo(cell2Size);
|
||||||
};
|
};
|
||||||
|
|
||||||
public static final Comparator<Rectangle> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
|
public static final Comparator<Rectangle2D> RECTANGLE_SIZE_COMPARATOR = (rect1, rect2) -> {
|
||||||
|
|
||||||
Double rect1Size = rect1.getHeight() * rect1.getWidth();
|
Double rect1Size = rect1.getHeight() * rect1.getWidth();
|
||||||
Double rect2Size = rect2.getHeight() * rect2.getWidth();
|
Double rect2Size = rect2.getHeight() * rect2.getWidth();
|
||||||
|
|||||||
@ -0,0 +1,223 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
public class HeaderFooterDetection {
|
||||||
|
|
||||||
|
private final Map<Integer, ClassificationPage> pagesCache = new HashMap<>();
|
||||||
|
private static final double THRESHOLD = 0.5;
|
||||||
|
// Weight will go from 1.0 to 0.5 because the first element is the most likely to be the header on the page.
|
||||||
|
private static final double[] headerWeights = {1.0, 0.75, 0.5};
|
||||||
|
// Weight will go from 0.5 to 1.0 because the last element is the most likely to be the footer on the page.
|
||||||
|
private static final double[] footerWeights = {0.5, 0.75, 1.0};
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isLikelyFooter(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
|
||||||
|
|
||||||
|
int numberOfPages = document.getPages().size();
|
||||||
|
if (numberOfPages < 3) {
|
||||||
|
// If the document has 1 or 2 pages this may lead to more false positives than actual findings.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
int window = Math.min(numberOfPages, 8);
|
||||||
|
|
||||||
|
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
|
||||||
|
List<List<AbstractPageBlock>> footerCandidates = getFooterCandidates(nearestPages);
|
||||||
|
|
||||||
|
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), footerCandidates, window, footerWeights);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean isLikelyHeader(TextPageBlock textPageBlock, ClassificationDocument document, ClassificationPage classificationPage) {
|
||||||
|
|
||||||
|
int numberOfPages = document.getPages().size();
|
||||||
|
if (numberOfPages < 3) {
|
||||||
|
// If the document has 1 or 2 pages this may lead to more false positives than actual findings.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
int window = Math.min(numberOfPages, 8);
|
||||||
|
|
||||||
|
List<ClassificationPage> nearestPages = findNearestPages(classificationPage, document.getPages(), window);
|
||||||
|
List<List<AbstractPageBlock>> headerCandidates = getHeaderCandidates(nearestPages);
|
||||||
|
|
||||||
|
return detectHeadersOrFootersByPageAssociation(textPageBlock.getText(), headerCandidates, window, headerWeights);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean detectHeadersOrFootersByPageAssociation(String testString, List<List<AbstractPageBlock>> candidates, int window, double[] weights) {
|
||||||
|
|
||||||
|
double highestScore = 0.0;
|
||||||
|
|
||||||
|
for (int i = 0; i < candidates.size(); i++) {
|
||||||
|
List<List<String>> candidateStrings = new ArrayList<>();
|
||||||
|
for (int k = Math.max(i - window, 0); k < Math.min(i + window, candidates.size()); k++) {
|
||||||
|
candidateStrings.add(candidates.get(k)
|
||||||
|
.stream()
|
||||||
|
.map(AbstractPageBlock::getText)
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
|
||||||
|
int maxLen = candidateStrings.stream()
|
||||||
|
.mapToInt(List::size)
|
||||||
|
.max()
|
||||||
|
.orElse(0);
|
||||||
|
for (List<String> sublist : candidateStrings) {
|
||||||
|
while (sublist.size() < maxLen) {
|
||||||
|
sublist.add(0, "");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compare the testString against each candidate in the window
|
||||||
|
for (int j = 0; j < maxLen; j++) {
|
||||||
|
double score = 0.0;
|
||||||
|
int finalJ = j;
|
||||||
|
List<String> paddedCandidateStrings = candidateStrings.stream()
|
||||||
|
.map(sublist -> sublist.size() > finalJ ? sublist.get(finalJ) : "")
|
||||||
|
.toList();
|
||||||
|
for (String paddedString : paddedCandidateStrings) {
|
||||||
|
if ((testString.length() >= 5 && paddedString.length() >= 5) && (testString.length() > 2 * paddedString.length()
|
||||||
|
|| paddedString.length() > 2 * testString.length())) {
|
||||||
|
// If both strings are at least 5 characters long and one string is more than twice as long as the other,
|
||||||
|
// skip the distance calculation as it's time-consuming, and we can assume they are not similar enough
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
int distance = calculateHammingDistanceWithPreprocessing(testString, paddedString);
|
||||||
|
double normalizedScore = 1 - (double) distance / Math.max(testString.length(), paddedString.length());
|
||||||
|
score += normalizedScore * (j < weights.length ? weights[j] : 1);
|
||||||
|
}
|
||||||
|
score /= paddedCandidateStrings.size();
|
||||||
|
highestScore = Math.max(highestScore, score);
|
||||||
|
// Early stop
|
||||||
|
if (highestScore > THRESHOLD) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find the nearest n pages for a given page.
|
||||||
|
* For example: nearest 8 pages for page 4 are: 1, 2, 3, 5, 6, 7, 8, 9.
|
||||||
|
*
|
||||||
|
* @param currentPage Current page to find the nearest ones.
|
||||||
|
* @param allPages All pages in the document.
|
||||||
|
* @param numNeighbors Number of neighbouring pages to find.
|
||||||
|
* @return The nearest pages.
|
||||||
|
*/
|
||||||
|
private List<ClassificationPage> findNearestPages(ClassificationPage currentPage, List<ClassificationPage> allPages, int numNeighbors) {
|
||||||
|
|
||||||
|
int totalPages = allPages.size();
|
||||||
|
List<ClassificationPage> nearestPages = new ArrayList<>();
|
||||||
|
|
||||||
|
int currentPageIndex = currentPage.getPageNumber() - 1;
|
||||||
|
int halfWin = numNeighbors / 2;
|
||||||
|
int start = Math.max(0, currentPageIndex - halfWin);
|
||||||
|
int end = Math.min(totalPages - 1, currentPageIndex + halfWin);
|
||||||
|
|
||||||
|
for (int i = start; i <= end; i++) {
|
||||||
|
if (i != currentPageIndex) {
|
||||||
|
nearestPages.add(pagesCache.computeIfAbsent(i, allPages::get));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pagesCache.keySet().removeIf(key -> key < start || key > end);
|
||||||
|
|
||||||
|
return nearestPages;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Get the last 3 TextBlocks on the page as they are likely to be a footer
|
||||||
|
private List<List<AbstractPageBlock>> getFooterCandidates(List<ClassificationPage> pages) {
|
||||||
|
|
||||||
|
List<List<AbstractPageBlock>> footerCandidates = new ArrayList<>();
|
||||||
|
for (ClassificationPage page : pages) {
|
||||||
|
List<AbstractPageBlock> textBlocks = page.getTextBlocks();
|
||||||
|
List<TextPageBlock> textPageBlocks = textBlocks.stream()
|
||||||
|
.filter(textBlock -> textBlock instanceof TextPageBlock)
|
||||||
|
.map(textBlock -> (TextPageBlock) textBlock)
|
||||||
|
.toList();
|
||||||
|
int blockCount = textPageBlocks.size();
|
||||||
|
if (blockCount > 0) {
|
||||||
|
int start = Math.max(0, blockCount - 3);
|
||||||
|
footerCandidates.add(new ArrayList<>(textPageBlocks.subList(start, blockCount)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return footerCandidates;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// Get the first 3 TextBlocks on the page as they are likely to be a header
|
||||||
|
private List<List<AbstractPageBlock>> getHeaderCandidates(List<ClassificationPage> pages) {
|
||||||
|
|
||||||
|
List<List<AbstractPageBlock>> headerCandidates = new ArrayList<>();
|
||||||
|
for (ClassificationPage page : pages) {
|
||||||
|
List<AbstractPageBlock> textBlocks = page.getTextBlocks();
|
||||||
|
List<TextPageBlock> textPageBlocks = textBlocks.stream()
|
||||||
|
.filter(textBlock -> textBlock instanceof TextPageBlock)
|
||||||
|
.map(textBlock -> (TextPageBlock) textBlock)
|
||||||
|
.toList();
|
||||||
|
int count = Math.min(3, textPageBlocks.size());
|
||||||
|
headerCandidates.add(new ArrayList<>(textPageBlocks.subList(0, count)));
|
||||||
|
}
|
||||||
|
return headerCandidates;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Calculate the Hamming distance between two strings after preprocessing to make them the same length
|
||||||
|
* and replacing all digits with a special character '@' since they are a common occurrence in headers/footers.
|
||||||
|
*
|
||||||
|
* @param firstCandidate First string
|
||||||
|
* @param secondCandidate Second string
|
||||||
|
* @return The Hamming distance between the two preprocessed strings.
|
||||||
|
*/
|
||||||
|
private int calculateHammingDistanceWithPreprocessing(String firstCandidate, String secondCandidate) {
|
||||||
|
|
||||||
|
int maxLength = Math.max(firstCandidate.length(), secondCandidate.length());
|
||||||
|
|
||||||
|
String cleanFirstCandidate = padString(firstCandidate, maxLength, '\0').replaceAll("\\d", "@");
|
||||||
|
String cleanSecondCandidate = padString(secondCandidate, maxLength, '\0').replaceAll("\\d", "@");
|
||||||
|
|
||||||
|
int distance = 0;
|
||||||
|
for (int i = 0; i < maxLength; i++) {
|
||||||
|
if (cleanFirstCandidate.charAt(i) != cleanSecondCandidate.charAt(i)) {
|
||||||
|
distance++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return distance;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private String padString(String input, int length, char padChar) {
|
||||||
|
|
||||||
|
if (input.length() >= length) {
|
||||||
|
return input;
|
||||||
|
}
|
||||||
|
|
||||||
|
StringBuilder sb = new StringBuilder(input);
|
||||||
|
|
||||||
|
while (sb.length() < length) {
|
||||||
|
sb.append(padChar);
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,12 +1,5 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
|
||||||
import lombok.experimental.UtilityClass;
|
|
||||||
import org.apache.pdfbox.cos.COSName;
|
|
||||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
|
||||||
import org.apache.pdfbox.text.TextPosition;
|
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
@ -14,12 +7,23 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.cos.COSName;
|
||||||
|
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||||
|
import org.apache.pdfbox.text.TextPosition;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
@UtilityClass
|
@UtilityClass
|
||||||
public class MarkedContentUtils {
|
public class MarkedContentUtils {
|
||||||
|
|
||||||
public static final String HEADER = "Header";
|
public static final String HEADER = "Header";
|
||||||
public static final String FOOTER = "Footer";
|
public static final String FOOTER = "Footer";
|
||||||
|
|
||||||
|
|
||||||
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
|
public List<Rectangle2D> getMarkedContentBboxPerLine(List<PDMarkedContent> markedContents, String subtype) {
|
||||||
|
|
||||||
if (markedContents == null) {
|
if (markedContents == null) {
|
||||||
@ -31,7 +35,8 @@ public class MarkedContentUtils {
|
|||||||
.filter(m -> m.getProperties() != null)
|
.filter(m -> m.getProperties() != null)
|
||||||
.filter(m -> m.getProperties().getItem("Subtype") != null)
|
.filter(m -> m.getProperties().getItem("Subtype") != null)
|
||||||
.filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype))
|
.filter(m -> ((COSName) m.getProperties().getItem("Subtype")).getName().equals(subtype))
|
||||||
.map(PDMarkedContent::getContents).flatMap(Collection::stream)
|
.map(PDMarkedContent::getContents)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
.filter(t -> t instanceof TextPosition)
|
.filter(t -> t instanceof TextPosition)
|
||||||
.map(t -> (TextPosition) t)
|
.map(t -> (TextPosition) t)
|
||||||
.filter(t -> !t.getUnicode().equals(" "))
|
.filter(t -> !t.getUnicode().equals(" "))
|
||||||
@ -41,16 +46,77 @@ public class MarkedContentUtils {
|
|||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
return markedContentByYPosition.values().stream()
|
return markedContentByYPosition.values()
|
||||||
.map(textPositions -> new TextPositionSequence(textPositions.stream()
|
.stream()
|
||||||
.toList(), 0, true)
|
.map(textPositions -> new TextPositionSequence(textPositions, 0, true).getBBoxInitialUserSpace())
|
||||||
.getRectangle())
|
.map(t -> new Rectangle2D.Double(t.getX(), t.getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight())))
|
||||||
.map(t -> new Rectangle2D.Float(t.getTopLeft().getX(), t.getTopLeft().getY() - Math.abs(t.getHeight()), t.getWidth(), Math.abs(t.getHeight()))).collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<MarkedContentPosition> getMarkedContentPositions(List<PDMarkedContent> markedContents) {
|
||||||
|
|
||||||
|
if (markedContents == null) {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
|
return markedContents.stream()
|
||||||
|
.filter(m -> !m.getContents().isEmpty())
|
||||||
|
.map(MarkedContentPosition::fromPDMarkedContent)
|
||||||
|
.toList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean intersects(TextPageBlock textBlock, Map<String, List<Rectangle2D>> markedContentBboxPerType, String type) {
|
public boolean intersects(TextPageBlock textBlock, Map<String, List<Rectangle2D>> markedContentBboxPerType, String type) {
|
||||||
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type).stream().anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
|
|
||||||
|
return markedContentBboxPerType.get(type) != null && markedContentBboxPerType.get(type)
|
||||||
|
.stream()
|
||||||
|
.anyMatch(rectangle -> rectangle.intersects(textBlock.getPdfMinX(), textBlock.getPdfMinY(), textBlock.getWidth(), textBlock.getHeight()));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public record MarkedContentPosition(String type, String subType, List<Rectangle2D> textPositions) {
|
||||||
|
|
||||||
|
public static MarkedContentPosition fromPDMarkedContent(PDMarkedContent markedContent) {
|
||||||
|
|
||||||
|
return new MarkedContentPosition(markedContent.getTag(), parseSubType(markedContent), parseTextPositions(markedContent.getContents()));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<Rectangle2D> parseTextPositions(List<Object> contents) {
|
||||||
|
|
||||||
|
return contents.stream()
|
||||||
|
.filter(content -> content instanceof TextPosition)
|
||||||
|
.map(content -> (TextPosition) content)
|
||||||
|
.filter(content -> !content.getUnicode().equals(" "))
|
||||||
|
.map(textPositions -> new TextPositionSequence(List.of(textPositions), 0, true))
|
||||||
|
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static String parseSubType(PDMarkedContent markedContent) {
|
||||||
|
|
||||||
|
if (markedContent == null || markedContent.getProperties() == null || markedContent.getProperties().getItem("Subtype") == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ((COSName) markedContent.getProperties().getItem("Subtype")).getName();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String formattedType() {
|
||||||
|
|
||||||
|
if (subType == null || subType.isEmpty()) {
|
||||||
|
return type;
|
||||||
|
}
|
||||||
|
if (type.equals("Artifact")) {
|
||||||
|
return subType;
|
||||||
|
}
|
||||||
|
return String.format("%s-%s", type, subType);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.processor.services.graphics;
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
|
|
||||||
@ -22,6 +22,15 @@ public record PageInformation(Rectangle2D mediabox, int number, int rotationDegr
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public double heightRot() {
|
||||||
|
|
||||||
|
if (rotationDegrees == 90 || rotationDegrees == 270) {
|
||||||
|
return width();
|
||||||
|
}
|
||||||
|
return height();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public double width() {
|
public double width() {
|
||||||
|
|
||||||
return mediabox.getWidth();
|
return mediabox.getWidth();
|
||||||
@ -114,7 +114,7 @@ public final class PositionUtils {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public Float getApproxLineCount(TextPageBlock textBlock) {
|
public double getApproxLineCount(TextPageBlock textBlock) {
|
||||||
|
|
||||||
return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
|
return textBlock.getHeight() / textBlock.getMostPopularWordHeight();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -52,7 +52,10 @@ public class RectangleTransformations {
|
|||||||
|
|
||||||
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
|
public static Rectangle2D bBoxUnionAtomicTextBlock(List<AtomicTextBlock> atomicTextBlocks) {
|
||||||
|
|
||||||
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
|
return atomicTextBlocks.stream()
|
||||||
|
.flatMap(atomicTextBlock -> atomicTextBlock.getPositions()
|
||||||
|
.stream())
|
||||||
|
.collect(new Rectangle2DBBoxCollector());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -77,7 +80,10 @@ public class RectangleTransformations {
|
|||||||
|
|
||||||
public static Rectangle2D atomicTextBlockBBox(List<AtomicTextBlock> atomicTextBlocks) {
|
public static Rectangle2D atomicTextBlockBBox(List<AtomicTextBlock> atomicTextBlocks) {
|
||||||
|
|
||||||
return atomicTextBlocks.stream().flatMap(atomicTextBlock -> atomicTextBlock.getPositions().stream()).collect(new Rectangle2DBBoxCollector());
|
return atomicTextBlocks.stream()
|
||||||
|
.flatMap(atomicTextBlock -> atomicTextBlock.getPositions()
|
||||||
|
.stream())
|
||||||
|
.collect(new Rectangle2DBBoxCollector());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -89,16 +95,18 @@ public class RectangleTransformations {
|
|||||||
|
|
||||||
public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
|
public static Rectangle2D rectangleBBox(List<Rectangle> rectangles) {
|
||||||
|
|
||||||
return rectangles.stream().map(RectangleTransformations::toRectangle2D).collect(new Rectangle2DBBoxCollector());
|
return rectangles.stream()
|
||||||
|
.map(RectangleTransformations::toRectangle2D)
|
||||||
|
.collect(new Rectangle2DBBoxCollector());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Rectangle2D toRectangle2D(Rectangle redactionLogRectangle) {
|
public static Rectangle2D toRectangle2D(Rectangle redactionLogRectangle) {
|
||||||
|
|
||||||
return new Rectangle2D.Double(redactionLogRectangle.getTopLeft().getX(),
|
return new Rectangle2D.Double(redactionLogRectangle.getTopLeft().getX(),
|
||||||
redactionLogRectangle.getTopLeft().getY() + redactionLogRectangle.getHeight(),
|
redactionLogRectangle.getTopLeft().getY() + redactionLogRectangle.getHeight(),
|
||||||
redactionLogRectangle.getWidth(),
|
redactionLogRectangle.getWidth(),
|
||||||
-redactionLogRectangle.getHeight());
|
-redactionLogRectangle.getHeight());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -111,15 +119,16 @@ public class RectangleTransformations {
|
|||||||
public static Rectangle toRedactionLogRectangle(Rectangle2D rectangle2D, int pageNumber) {
|
public static Rectangle toRedactionLogRectangle(Rectangle2D rectangle2D, int pageNumber) {
|
||||||
|
|
||||||
return new Rectangle(new Point((float) rectangle2D.getMinX(), (float) (rectangle2D.getMinY() + rectangle2D.getHeight())),
|
return new Rectangle(new Point((float) rectangle2D.getMinX(), (float) (rectangle2D.getMinY() + rectangle2D.getHeight())),
|
||||||
(float) rectangle2D.getWidth(),
|
(float) rectangle2D.getWidth(),
|
||||||
-(float) rectangle2D.getHeight(),
|
-(float) rectangle2D.getHeight(),
|
||||||
pageNumber);
|
pageNumber);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Rectangle2D rectangle2DBBox(List<Rectangle2D> rectangle2DList) {
|
public static Rectangle2D rectangle2DBBox(List<Rectangle2D> rectangle2DList) {
|
||||||
|
|
||||||
return rectangle2DList.stream().collect(new Rectangle2DBBoxCollector());
|
return rectangle2DList.stream()
|
||||||
|
.collect(new Rectangle2DBBoxCollector());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -134,7 +143,8 @@ public class RectangleTransformations {
|
|||||||
if (rectangle2DList.isEmpty()) {
|
if (rectangle2DList.isEmpty()) {
|
||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
double splitThreshold = rectangle2DList.stream().mapToDouble(RectangularShape::getWidth).average().orElse(5) * 5.0;
|
double splitThreshold = rectangle2DList.stream()
|
||||||
|
.mapToDouble(RectangularShape::getWidth).average().orElse(5) * 5.0;
|
||||||
|
|
||||||
List<List<Rectangle2D>> rectangleListsWithGaps = new LinkedList<>();
|
List<List<Rectangle2D>> rectangleListsWithGaps = new LinkedList<>();
|
||||||
List<Rectangle2D> rectangleListWithoutGaps = new LinkedList<>();
|
List<Rectangle2D> rectangleListWithoutGaps = new LinkedList<>();
|
||||||
@ -171,7 +181,7 @@ public class RectangleTransformations {
|
|||||||
verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y),
|
verticalRulings.add(new Ruling(new Point2D.Float(rectangle.x + rectangle.width, rectangle.y),
|
||||||
new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height)));
|
new Point2D.Float(rectangle.x + rectangle.width, rectangle.y + rectangle.height)));
|
||||||
});
|
});
|
||||||
return CleanRulings.builder().vertical(verticalRulings).horizontal(horizontalRulings).build();
|
return new CleanRulings(verticalRulings, horizontalRulings);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -195,9 +205,9 @@ public class RectangleTransformations {
|
|||||||
public BinaryOperator<BBox> combiner() {
|
public BinaryOperator<BBox> combiner() {
|
||||||
|
|
||||||
return (b1, b2) -> new BBox(Math.min(b1.lowerLeftX, b2.lowerLeftX),
|
return (b1, b2) -> new BBox(Math.min(b1.lowerLeftX, b2.lowerLeftX),
|
||||||
Math.min(b1.lowerLeftY, b2.lowerLeftY),
|
Math.min(b1.lowerLeftY, b2.lowerLeftY),
|
||||||
Math.max(b1.upperRightX, b2.upperRightX),
|
Math.max(b1.upperRightX, b2.upperRightX),
|
||||||
Math.max(b1.upperRightY, b2.upperRightY));
|
Math.max(b1.upperRightY, b2.upperRightY));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -14,23 +14,24 @@ public class RectangularIntersectionFinder {
|
|||||||
|
|
||||||
public static List<Rectangle2D> find(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
public static List<Rectangle2D> find(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
|
||||||
|
|
||||||
// Fix for 211.pdf
|
// // Fix for 211.pdf
|
||||||
for (Ruling r : horizontalRulingLines) {
|
// for (Ruling r : horizontalRulingLines) {
|
||||||
if (r.getX2() < r.getX1()) {
|
// if (r.getX2() < r.getX1()) {
|
||||||
double a = r.getX2();
|
// double a = r.getX2();
|
||||||
r.x2 = (float) r.getX1();
|
// r.x2 = (float) r.getX1();
|
||||||
r.x1 = (float) a;
|
// r.x1 = (float) a;
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
|
||||||
List<Rectangle2D> foundRectangles = new ArrayList<>();
|
List<Rectangle2D> foundRectangles = new ArrayList<>();
|
||||||
Map<Point2D, Ruling[]> intersectionPoints = Ruling.findIntersections(horizontalRulingLines, verticalRulingLines);
|
Map<Point2D, RulingIntersectionFinder.IntersectingRulings> intersectionPoints = RulingIntersectionFinder.findNaive(horizontalRulingLines, verticalRulingLines);
|
||||||
|
|
||||||
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
|
List<Point2D> intersectionPointsList = new ArrayList<>(intersectionPoints.keySet());
|
||||||
intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR);
|
intersectionPointsList.sort(Y_FIRST_POINT_COMPARATOR);
|
||||||
|
|
||||||
for (int i = 0; i < intersectionPointsList.size(); i++) {
|
for (int i = 0; i < intersectionPointsList.size(); i++) {
|
||||||
Point2D topLeft = intersectionPointsList.get(i);
|
Point2D topLeft = intersectionPointsList.get(i);
|
||||||
Ruling[] hv = intersectionPoints.get(topLeft);
|
RulingIntersectionFinder.IntersectingRulings intersectingRulingsFromTopLeft = intersectionPoints.get(topLeft);
|
||||||
|
|
||||||
// CrossingPointsDirectlyBelow( topLeft );
|
// CrossingPointsDirectlyBelow( topLeft );
|
||||||
List<Point2D> xPoints = new ArrayList<>();
|
List<Point2D> xPoints = new ArrayList<>();
|
||||||
@ -48,19 +49,24 @@ public class RectangularIntersectionFinder {
|
|||||||
outer:
|
outer:
|
||||||
for (Point2D xPoint : xPoints) {
|
for (Point2D xPoint : xPoints) {
|
||||||
// is there a vertical edge b/w topLeft and xPoint?
|
// is there a vertical edge b/w topLeft and xPoint?
|
||||||
if (!hv[1].equals(intersectionPoints.get(xPoint)[1])) {
|
if (!intersectingRulingsFromTopLeft.vertical().equals(intersectionPoints.get(xPoint).vertical())) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
for (Point2D yPoint : yPoints) {
|
for (Point2D yPoint : yPoints) {
|
||||||
// is there a horizontal edge b/w topLeft and yPoint ?
|
// is there a horizontal edge b/w topLeft and yPoint ?
|
||||||
if (!hv[0].equals(intersectionPoints.get(yPoint)[0])) {
|
if (!intersectingRulingsFromTopLeft.horizontal().equals(intersectionPoints.get(yPoint).horizontal())) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
|
Point2D btmRight = new Point2D.Float((float) yPoint.getX(), (float) xPoint.getY());
|
||||||
if (intersectionPoints.containsKey(btmRight)
|
if (intersectionPoints.containsKey(btmRight)
|
||||||
&& intersectionPoints.get(btmRight)[0].equals(intersectionPoints.get(xPoint)[0])
|
&& intersectionPoints.get(btmRight).horizontal().equals(intersectionPoints.get(xPoint).horizontal())
|
||||||
&& intersectionPoints.get(btmRight)[1].equals(intersectionPoints.get(yPoint)[1])) {
|
&& intersectionPoints.get(btmRight).vertical().equals(intersectionPoints.get(yPoint).vertical())) {
|
||||||
|
|
||||||
foundRectangles.add(new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), btmRight.getX() - topLeft.getX(), btmRight.getY() - topLeft.getY()));
|
foundRectangles.add(new Rectangle2D.Double(topLeft.getX(), topLeft.getY(), btmRight.getX() - topLeft.getX(), btmRight.getY() - topLeft.getY()));
|
||||||
|
intersectionPoints.get(topLeft).horizontal().setClassification(Ruling.Classification.TABLE_LINE);
|
||||||
|
intersectionPoints.get(topLeft).vertical().setClassification(Ruling.Classification.TABLE_LINE);
|
||||||
|
intersectionPoints.get(btmRight).horizontal().setClassification(Ruling.Classification.TABLE_LINE);
|
||||||
|
intersectionPoints.get(btmRight).vertical().setClassification(Ruling.Classification.TABLE_LINE);
|
||||||
break outer;
|
break outer;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,200 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.utils;
|
||||||
|
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
@UtilityClass
|
||||||
|
public class RulingIntersectionFinder {
|
||||||
|
|
||||||
|
public static final int PERPENDICULAR_UNIT_EXPAND_AMOUNT = 2;
|
||||||
|
|
||||||
|
public static final Comparator<Point2D> Y_THEN_X_POINT_COMPARATOR = Comparator.comparingDouble(Point2D::getY).thenComparing(Point2D::getX);
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Implementation to find line intersection in O(P + n log n), where n is the number of lines and P the numer of intersections.
|
||||||
|
* based on <a href="http://people.csail.mit.edu/indyk/6.838-old/handouts/lec2.pdf">Segment Intersection by Piotr Indyk</a>
|
||||||
|
*
|
||||||
|
* @param horizontals a list of non-overlapping horizontal rulings
|
||||||
|
* @param verticals a list of non-overlapping vertical rulings
|
||||||
|
* @return a Map of each found intersection point pointing to the two lines forming the intersection.
|
||||||
|
*/
|
||||||
|
/*
|
||||||
|
* The algorithm assumes there are only horizontal and vertical lines which are unique in their coordinates. (E.g. no overlapping horizontal lines exist)
|
||||||
|
* As a high level overview, the algorithm uses a sweep line advancing from left to right.
|
||||||
|
* It dynamically updates the horizontal rulings which are intersected by the current sweep line.
|
||||||
|
* When the sweep line hits a vertical line, it then checks for all intersections with the currently intersected horizontal rulings.
|
||||||
|
* THe trick of the algorithm is using a binary search tree to store the currently intersected horizontal rulings. This way the lookup should be in O(log n).
|
||||||
|
* This way the initial sorting step has the highest complexity class (O(n log n) and thus determines the complexity class of the entire algorithm
|
||||||
|
* Unfortunately, the implementation here takes a few liberties compared to the original algorithm. The binary search tree is replaced by an ordered Set which is simply looped over.
|
||||||
|
* Therefore, this implementation's worst case, where all horizontal lines span the entire sweep, you are essentially performing the naive approach with a bunch of overhead.
|
||||||
|
* Since we are using this implementation to find table cells, one can expect this worst case to always be the case.
|
||||||
|
* A simple runtime comparison for a single page with the most lines we can expect (SinglePages/AbsolutelyEnormousTable.pdf with 30 horizontals and 144 verticals) shows this implementation takes roughly 14 ms, whereas the naive approach takes 7 ms. Both are negligible, but the naive approach is two times as fast.
|
||||||
|
* If we would like to make this faster, we would need a better data structure for 'TreeMap<Ruling, Void> horizontalRulingsInCurrentSweep', where we can query the TreeMap for all horizontal rulings in a given interval in O(log n).
|
||||||
|
*/
|
||||||
|
public Map<Point2D, IntersectingRulings> find(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||||
|
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
|
List<SweepStep> sweepTrajectory = buildSweepTrajectory(horizontals, verticals);
|
||||||
|
|
||||||
|
TreeMap<Ruling, Void> horizontalRulingsInCurrentSweep = new TreeMap<>(Comparator.comparingDouble(Ruling::getTop));
|
||||||
|
|
||||||
|
TreeMap<Point2D, IntersectingRulings> intersections = new TreeMap<>(Y_THEN_X_POINT_COMPARATOR);
|
||||||
|
|
||||||
|
for (SweepStep step : sweepTrajectory) {
|
||||||
|
switch (step.type) {
|
||||||
|
case VERTICAL: // check for intersections with currently intersected horizontal lines
|
||||||
|
for (Ruling horizontalRuling : horizontalRulingsInCurrentSweep.navigableKeySet()) {
|
||||||
|
|
||||||
|
Optional<Point2D> intersectionPoint = findIntersectionPoint(horizontalRuling, step.ruling);
|
||||||
|
|
||||||
|
if (intersectionPoint.isEmpty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
intersections.put(intersectionPoint.get(), new IntersectingRulings(horizontalRuling, step.ruling));
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case HORIZONTAL_ENTRY: // sweep line now intersects this horizontal ruling
|
||||||
|
horizontalRulingsInCurrentSweep.put(step.ruling, null);
|
||||||
|
break;
|
||||||
|
case HORIZONTAL_EXIT: // sweep line no longer intersects this horizontal ruling
|
||||||
|
horizontalRulingsInCurrentSweep.remove(step.ruling);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
log.debug("Finished building intersections with line sweep in {} ms", System.currentTimeMillis() - start);
|
||||||
|
|
||||||
|
return intersections;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Naive Approach in O(n^2) of finding intersections between lines by iterating over all lines.
|
||||||
|
*
|
||||||
|
* @param horizontals a list of non-overlapping horizontal rulings
|
||||||
|
* @param verticals a list of non-overlapping vertical rulings
|
||||||
|
* @return a Map of each found intersection point pointing to the two lines forming the intersection.
|
||||||
|
*/
|
||||||
|
public Map<Point2D, IntersectingRulings> findNaive(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||||
|
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
|
TreeMap<Point2D, IntersectingRulings> intersections = new TreeMap<>(Y_THEN_X_POINT_COMPARATOR);
|
||||||
|
|
||||||
|
for (Ruling horizontal : horizontals) {
|
||||||
|
for (Ruling vertical : verticals) {
|
||||||
|
Optional<Point2D> intersectionPoint = findIntersectionPoint(horizontal, vertical);
|
||||||
|
|
||||||
|
if (intersectionPoint.isEmpty()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
intersections.put(intersectionPoint.get(), new IntersectingRulings(horizontal, vertical));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
log.debug("Finished building intersections naively in {} ms", System.currentTimeMillis() - start);
|
||||||
|
|
||||||
|
return intersections;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<SweepStep> buildSweepTrajectory(List<Ruling> horizontals, List<Ruling> verticals) {
|
||||||
|
|
||||||
|
List<SweepStep> sweepTrajectory = new LinkedList<>();
|
||||||
|
|
||||||
|
for (Ruling horizontalRuling : horizontals) {
|
||||||
|
sweepTrajectory.add(new SweepStep(SweepStep.Type.HORIZONTAL_ENTRY, horizontalRuling.getLeft() - PERPENDICULAR_UNIT_EXPAND_AMOUNT, horizontalRuling));
|
||||||
|
sweepTrajectory.add(new SweepStep(SweepStep.Type.HORIZONTAL_EXIT, horizontalRuling.getRight() + PERPENDICULAR_UNIT_EXPAND_AMOUNT, horizontalRuling));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (Ruling verticalRuling : verticals) {
|
||||||
|
sweepTrajectory.add(new SweepStep(SweepStep.Type.VERTICAL, verticalRuling.getLeft(), verticalRuling));
|
||||||
|
}
|
||||||
|
|
||||||
|
Collections.sort(sweepTrajectory);
|
||||||
|
|
||||||
|
return sweepTrajectory;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Optional<Point2D> findIntersectionPoint(Ruling horizontal, Ruling vertical) {
|
||||||
|
|
||||||
|
if (!horizontal.isHorizontal() || !vertical.isVertical()) {
|
||||||
|
log.warn("lines must be orthogonal, vertical and horizontal");
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
Ruling expanded_horizontal = horizontal.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
|
||||||
|
Ruling expanded_vertical = vertical.expand(PERPENDICULAR_UNIT_EXPAND_AMOUNT);
|
||||||
|
|
||||||
|
if (!expanded_horizontal.intersectsLine(expanded_vertical)) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
return Optional.of(new Point2D.Float(vertical.getLeft(), horizontal.getTop()));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private class SweepStep implements Comparable<SweepStep> {
|
||||||
|
|
||||||
|
protected Type type;
|
||||||
|
protected float y_position;
|
||||||
|
protected Ruling ruling;
|
||||||
|
|
||||||
|
private enum Type {
|
||||||
|
VERTICAL,
|
||||||
|
HORIZONTAL_EXIT,
|
||||||
|
HORIZONTAL_ENTRY
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
SweepStep(Type type, float y_position, Ruling ruling) {
|
||||||
|
|
||||||
|
this.type = type;
|
||||||
|
this.y_position = y_position;
|
||||||
|
this.ruling = ruling;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compareTo(SweepStep other) {
|
||||||
|
|
||||||
|
int rv;
|
||||||
|
if (DoubleComparisons.feq(y_position, other.y_position)) {
|
||||||
|
if (type == SweepStep.Type.VERTICAL && other.type == SweepStep.Type.HORIZONTAL_ENTRY) {
|
||||||
|
rv = 1;
|
||||||
|
} else if (type == SweepStep.Type.VERTICAL && other.type == SweepStep.Type.HORIZONTAL_EXIT) {
|
||||||
|
rv = -1;
|
||||||
|
} else if (type == SweepStep.Type.HORIZONTAL_ENTRY && other.type == SweepStep.Type.VERTICAL) {
|
||||||
|
rv = -1;
|
||||||
|
} else if (type == SweepStep.Type.HORIZONTAL_EXIT && other.type == SweepStep.Type.VERTICAL) {
|
||||||
|
rv = 1;
|
||||||
|
} else {
|
||||||
|
rv = Double.compare(y_position, other.y_position);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return Double.compare(y_position, other.y_position);
|
||||||
|
}
|
||||||
|
return rv;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public record IntersectingRulings(Ruling horizontal, Ruling vertical) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -4,6 +4,7 @@ import static com.knecon.fforesight.service.layoutparser.processor.utils.Geometr
|
|||||||
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR;
|
import static com.knecon.fforesight.service.layoutparser.processor.utils.GeometricComparators.Y_FIRST_POINT_COMPARATOR;
|
||||||
|
|
||||||
import java.awt.geom.Point2D;
|
import java.awt.geom.Point2D;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
@ -11,7 +12,7 @@ import java.util.List;
|
|||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Rectangle;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
|
|
||||||
public class SpreadsheetFinder {
|
public class SpreadsheetFinder {
|
||||||
|
|
||||||
@ -19,15 +20,15 @@ public class SpreadsheetFinder {
|
|||||||
private static final float AREA_TOLERANCE = 0.001f;
|
private static final float AREA_TOLERANCE = 0.001f;
|
||||||
|
|
||||||
|
|
||||||
public static List<Rectangle> findSpreadsheetsFromCells(List<? extends Rectangle> cells) {
|
public static List<Rectangle2D> findSpreadsheetsFromCells(List<Cell> cells) {
|
||||||
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
|
// via: http://stackoverflow.com/questions/13746284/merging-multiple-adjacent-rectangles-into-one-polygon
|
||||||
List<Rectangle> rectangles = new ArrayList<>();
|
List<Rectangle2D> rectangles = new ArrayList<>();
|
||||||
Set<Point2D> pointSet = new HashSet<>();
|
Set<Point2D> pointSet = new HashSet<>();
|
||||||
Map<Point2D, Point2D> edgesH = new HashMap<>();
|
Map<Point2D, Point2D> edgesH = new HashMap<>();
|
||||||
Map<Point2D, Point2D> edgesV = new HashMap<>();
|
Map<Point2D, Point2D> edgesV = new HashMap<>();
|
||||||
|
|
||||||
for (Rectangle cell : cells) {
|
for (Cell cell : cells) {
|
||||||
for (Point2D pt : cell.getPoints()) {
|
for (Point2D pt : getPoints(cell.getBBoxInitialUserSpace())) {
|
||||||
if (pointSet.contains(pt)) { // shared vertex, remove it
|
if (pointSet.contains(pt)) { // shared vertex, remove it
|
||||||
pointSet.remove(pt);
|
pointSet.remove(pt);
|
||||||
} else {
|
} else {
|
||||||
@ -116,13 +117,22 @@ public class SpreadsheetFinder {
|
|||||||
|
|
||||||
// do not add polygons with too many outer points as they are unlikely to be tables
|
// do not add polygons with too many outer points as they are unlikely to be tables
|
||||||
if (poly.size() <= MAX_OUTER_POINT_TOLERANCE) {
|
if (poly.size() <= MAX_OUTER_POINT_TOLERANCE) {
|
||||||
rectangles.add(new Rectangle(top - AREA_TOLERANCE, left - AREA_TOLERANCE, right - left + 2 * AREA_TOLERANCE, bottom - top + 2 * AREA_TOLERANCE));
|
rectangles.add(new Rectangle2D.Double(left - AREA_TOLERANCE, top - AREA_TOLERANCE, right - left + (2 * AREA_TOLERANCE), bottom - top + (2 * AREA_TOLERANCE)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return rectangles;
|
return rectangles;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static List<Point2D> getPoints(Rectangle2D rectangle2D) {
|
||||||
|
|
||||||
|
return List.of(new Point2D.Double(rectangle2D.getX(), rectangle2D.getY()),
|
||||||
|
new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getY()),
|
||||||
|
new Point2D.Double(rectangle2D.getMaxX(), rectangle2D.getMaxY()),
|
||||||
|
new Point2D.Double(rectangle2D.getX(), rectangle2D.getMaxY()));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private enum Direction {
|
private enum Direction {
|
||||||
HORIZONTAL,
|
HORIZONTAL,
|
||||||
VERTICAL
|
VERTICAL
|
||||||
|
|||||||
@ -39,21 +39,21 @@ public class TextPositionSequenceComparator implements Comparator<TextPositionSe
|
|||||||
}
|
}
|
||||||
|
|
||||||
// get the text direction adjusted coordinates
|
// get the text direction adjusted coordinates
|
||||||
float x1 = pos1.getMinXDirAdj();
|
double x1 = pos1.getBBox().getX();
|
||||||
float x2 = pos2.getMinXDirAdj();
|
double x2 = pos2.getBBox().getX();
|
||||||
|
|
||||||
float pos1YBottom = pos1.getMaxYDirAdj();
|
double pos1YBottom = pos1.getBBox().getMaxY();
|
||||||
float pos2YBottom = pos2.getMaxYDirAdj();
|
double pos2YBottom = pos2.getBBox().getMaxY();
|
||||||
|
|
||||||
// note that the coordinates have been adjusted so 0,0 is in upper left
|
// note that the coordinates have been adjusted so 0,0 is in upper left
|
||||||
float pos1YTop = pos1YBottom - pos1.getTextHeightNoPadding();
|
double pos1YTop = pos1YBottom - pos1.getBBox().getHeight();
|
||||||
float pos2YTop = pos2YBottom - pos2.getTextHeightNoPadding();
|
double pos2YTop = pos2YBottom - pos2.getBBox().getHeight();
|
||||||
|
|
||||||
float yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
double yDifference = Math.abs(pos1YBottom - pos2YBottom);
|
||||||
|
|
||||||
// we will do a simple tolerance comparison
|
// we will do a simple tolerance comparison
|
||||||
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
|
if (yDifference < .1 || pos2YBottom >= pos1YTop && pos2YBottom <= pos1YBottom || pos1YBottom >= pos2YTop && pos1YBottom <= pos2YBottom) {
|
||||||
return Float.compare(x1, x2);
|
return Double.compare(x1, x2);
|
||||||
} else if (pos1YBottom < pos2YBottom) {
|
} else if (pos1YBottom < pos2YBottom) {
|
||||||
return -1;
|
return -1;
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@ -0,0 +1,310 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.visualization;
|
||||||
|
|
||||||
|
import java.awt.Color;
|
||||||
|
import java.awt.geom.Line2D;
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
||||||
|
|
||||||
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.BoundingBox;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Line;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.docstrum.model.Zone;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.MarkedContentUtils;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||||
|
import com.knecon.fforesight.service.viewerdoc.model.ColoredLine;
|
||||||
|
import com.knecon.fforesight.service.viewerdoc.model.ColoredRectangle;
|
||||||
|
import com.knecon.fforesight.service.viewerdoc.model.PlacedText;
|
||||||
|
import com.knecon.fforesight.service.viewerdoc.model.Standard14EmbeddableFont;
|
||||||
|
import com.knecon.fforesight.service.viewerdoc.model.Visualizations;
|
||||||
|
import com.knecon.fforesight.service.viewerdoc.model.VisualizationsOnPage;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.Setter;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
|
@Getter
|
||||||
|
@NoArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public class LayoutparsingVisualizations {
|
||||||
|
|
||||||
|
static final Standard14EmbeddableFont FONT = Standard14EmbeddableFont.helvetica();
|
||||||
|
|
||||||
|
static final Color WORDS_COLOR = new Color(68, 84, 147);
|
||||||
|
static final Color LINES_COLOR = new Color(152, 45, 179);
|
||||||
|
static final Color ZONES_COLOR = new Color(131, 38, 38);
|
||||||
|
|
||||||
|
static final Color RULINGS_COLOR = new Color(21, 221, 174);
|
||||||
|
static final Color TABLE_RULINGS_COLOR = new Color(255, 175, 175);
|
||||||
|
static final Color HEADER_RULING_COLOR = new Color(171, 131, 6);
|
||||||
|
static final Color FOOTER_RULING_COLOR = new Color(106, 82, 2);
|
||||||
|
static final Color UNDERLINE_RULING_COLOR = new Color(6, 39, 171);
|
||||||
|
static final Color STRIKETROUGH_RULING_COLOR = new Color(171, 6, 6);
|
||||||
|
|
||||||
|
static final Color CELLS_COLOR = new Color(31, 214, 27);
|
||||||
|
|
||||||
|
static final Color MAIN_BODY_COLOR = new Color(171, 131, 6);
|
||||||
|
static final Color MARKED_CONTENT_COLOR = new Color(171, 131, 6);
|
||||||
|
|
||||||
|
static final List<Color> ROTATING_CHARACTER_COLOR = List.of(new Color(255, 87, 51),
|
||||||
|
new Color(255, 195, 0),
|
||||||
|
new Color(76, 175, 80),
|
||||||
|
new Color(33, 150, 243),
|
||||||
|
new Color(155, 89, 182),
|
||||||
|
new Color(233, 30, 99),
|
||||||
|
new Color(0, 188, 212),
|
||||||
|
new Color(121, 85, 72));
|
||||||
|
|
||||||
|
@Setter
|
||||||
|
boolean active;
|
||||||
|
|
||||||
|
final Visualizations words = Visualizations.builder().layer(ContentStreams.WORDS).build();
|
||||||
|
final Visualizations lines = Visualizations.builder().layer(ContentStreams.LINES).build();
|
||||||
|
final Visualizations zones = Visualizations.builder().layer(ContentStreams.ZONES).build();
|
||||||
|
final Visualizations mainBody = Visualizations.builder().layer(ContentStreams.MAIN_BODY).build();
|
||||||
|
final Visualizations clean_rulings = Visualizations.builder().layer(ContentStreams.CLEAN_RULINGS).build();
|
||||||
|
final Visualizations rulings = Visualizations.builder().layer(ContentStreams.RULINGS).build();
|
||||||
|
final Visualizations cells = Visualizations.builder().layer(ContentStreams.CELLS).build();
|
||||||
|
final Visualizations markedContent = Visualizations.builder().layer(ContentStreams.MARKED_CONTENT).build();
|
||||||
|
final Visualizations neighbours = Visualizations.builder().layer(ContentStreams.NEIGHBOURS).build();
|
||||||
|
final Visualizations characters = Visualizations.builder().layer(ContentStreams.CHARACTERS).build();
|
||||||
|
|
||||||
|
|
||||||
|
public Stream<Visualizations> streamAll() {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return Stream.empty();
|
||||||
|
}
|
||||||
|
return Stream.of(characters, //
|
||||||
|
neighbours,//
|
||||||
|
words, //
|
||||||
|
lines, //
|
||||||
|
zones, //
|
||||||
|
rulings, //
|
||||||
|
clean_rulings, //
|
||||||
|
cells, //
|
||||||
|
mainBody, //
|
||||||
|
markedContent //
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addTextVisualizations(List<TextPositionSequence> textPositionSequences, int pageNumber) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.words);
|
||||||
|
visualizationsOnPage.getColoredRectangles()
|
||||||
|
.addAll(textPositionSequences.stream()
|
||||||
|
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||||
|
.map(rect -> new ColoredRectangle(rect, WORDS_COLOR, 1))
|
||||||
|
.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addCleanRulingVisualization(CleanRulings cleanRulings, int pageNumber) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.clean_rulings);
|
||||||
|
visualizationsOnPage.getColoredLines()
|
||||||
|
.addAll(cleanRulings.buildAll()
|
||||||
|
.stream()
|
||||||
|
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
|
||||||
|
.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addRulingVisualization(List<Ruling> rulings, int pageNumber) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.rulings);
|
||||||
|
visualizationsOnPage.getColoredLines()
|
||||||
|
.addAll(rulings
|
||||||
|
.stream()
|
||||||
|
.map(ruling -> new ColoredLine(ruling, decideOnRulingColor(ruling), 0.5f))
|
||||||
|
.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private Color decideOnRulingColor(Ruling ruling) {
|
||||||
|
|
||||||
|
return switch (ruling.getClassification()) {
|
||||||
|
case TABLE_LINE -> TABLE_RULINGS_COLOR;
|
||||||
|
case HEADER_SEPARATOR -> HEADER_RULING_COLOR;
|
||||||
|
case FOOTER_SEPARATOR -> FOOTER_RULING_COLOR;
|
||||||
|
case UNDERLINE -> UNDERLINE_RULING_COLOR;
|
||||||
|
case STRIKETROUGH -> STRIKETROUGH_RULING_COLOR;
|
||||||
|
default -> RULINGS_COLOR;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addCellVisualizations(List<? extends BoundingBox> cells, int pageNumber) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, this.cells);
|
||||||
|
visualizationsOnPage.getColoredRectangles()
|
||||||
|
.addAll(cells.stream()
|
||||||
|
.map(cell -> new ColoredRectangle(cell.getBBoxInitialUserSpace(), CELLS_COLOR, 1))
|
||||||
|
.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addZoneVisualizations(List<Zone> zones, int page) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.zones);
|
||||||
|
visualizationsOnPage.getColoredRectangles()
|
||||||
|
.addAll(zones.stream()
|
||||||
|
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||||
|
.map(zone -> new ColoredRectangle(zone, ZONES_COLOR, 1))
|
||||||
|
.toList());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addLineVisualizationsFromZones(List<Zone> zones, int page) {
|
||||||
|
|
||||||
|
addLineVisualizations(zones.stream()
|
||||||
|
.map(Zone::getLines)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.toList(), page);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addLineVisualizations(List<Line> lines, int page) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, this.lines);
|
||||||
|
visualizationsOnPage.getColoredRectangles()
|
||||||
|
.addAll(lines.stream()
|
||||||
|
.map(BoundingBox::getBBoxInitialUserSpace)
|
||||||
|
.map(line -> new ColoredRectangle(line, LINES_COLOR, 0.5f))
|
||||||
|
.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addTextBlockVisualizations(List<TextPageBlock> textPageBlocks, int page) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(page, zones);
|
||||||
|
visualizationsOnPage.getColoredRectangles()
|
||||||
|
.addAll(textPageBlocks.stream()
|
||||||
|
.map(rect -> new ColoredRectangle(rect.getBBoxInitialUserSpace(), ZONES_COLOR, 1))
|
||||||
|
.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addMainBodyVisualization(Rectangle rectangle, int pageNumber) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, mainBody);
|
||||||
|
visualizationsOnPage.getColoredRectangles()
|
||||||
|
.add(new ColoredRectangle(new Rectangle2D.Double(rectangle.getTopLeft().getX(), rectangle.getTopLeft().getY(), rectangle.getWidth(), rectangle.getHeight()),
|
||||||
|
MAIN_BODY_COLOR,
|
||||||
|
1));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addMarkedContentVisualizations(List<PDMarkedContent> markedContents, int pageNumber) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
VisualizationsOnPage visualizationsOnPage = getOrCreateVisualizationsOnPage(pageNumber, markedContent);
|
||||||
|
|
||||||
|
List<MarkedContentUtils.MarkedContentPosition> markedContentBBoxMapBySubType = MarkedContentUtils.getMarkedContentPositions(markedContents);
|
||||||
|
AtomicInteger count = new AtomicInteger();
|
||||||
|
markedContentBBoxMapBySubType.forEach(markedContentPosition -> {
|
||||||
|
var bbox = markedContentPosition.textPositions()
|
||||||
|
.stream()
|
||||||
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
String type = markedContentPosition.formattedType() + " " + count.getAndIncrement();
|
||||||
|
|
||||||
|
float translationAmount = ((FONT.getStringWidth(type) / 100) + 6);
|
||||||
|
// Pushes the string to the left of the box: calculate string width, divide by font units (1000), multiply with font size (10), add small offset (6).
|
||||||
|
|
||||||
|
visualizationsOnPage.getPlacedTexts()
|
||||||
|
.add(PlacedText.textFacingUp(type, new Point2D.Double(bbox.getX() - translationAmount, bbox.getY() + bbox.getHeight()), 10, Color.BLACK, FONT));
|
||||||
|
|
||||||
|
visualizationsOnPage.getColoredRectangles().add(new ColoredRectangle(bbox, MARKED_CONTENT_COLOR, 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addCharactersWithNeighbours(List<Zone> zones, int page) {
|
||||||
|
|
||||||
|
if (!active) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
VisualizationsOnPage characterVisualizations = getOrCreateVisualizationsOnPage(page, characters);
|
||||||
|
VisualizationsOnPage neighbourVisualizations = getOrCreateVisualizationsOnPage(page, neighbours);
|
||||||
|
|
||||||
|
AtomicInteger index = new AtomicInteger(0);
|
||||||
|
zones.forEach(zone -> zone.getLines()
|
||||||
|
.stream()
|
||||||
|
.map(Line::getCharacters)
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.forEach(character -> {
|
||||||
|
Color color = ROTATING_CHARACTER_COLOR.get(index.getAndIncrement() % ROTATING_CHARACTER_COLOR.size());
|
||||||
|
Rectangle2D charBBox = character.getTextPosition().getBBoxInitialUserSpace();
|
||||||
|
characterVisualizations.getColoredRectangles().add(new ColoredRectangle(charBBox, color, 1));
|
||||||
|
character.getNeighbors()
|
||||||
|
.forEach(neighbor -> {
|
||||||
|
Rectangle2D neighborBBox = neighbor.getCharacter().getTextPosition().getBBoxInitialUserSpace();
|
||||||
|
Line2D line = new Line2D.Double(new Point2D.Double(charBBox.getCenterX(), charBBox.getCenterY()),
|
||||||
|
new Point2D.Double(neighborBBox.getCenterX(), neighborBBox.getCenterY()));
|
||||||
|
neighbourVisualizations.getColoredLines().add(new ColoredLine(line, color, 1));
|
||||||
|
});
|
||||||
|
}));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private VisualizationsOnPage getOrCreateVisualizationsOnPage(int page, Visualizations visualizations) {
|
||||||
|
|
||||||
|
if (visualizations.getVisualizationsOnPages().containsKey(page - 1)) {
|
||||||
|
return visualizations.getVisualizationsOnPages()
|
||||||
|
.get(page - 1);
|
||||||
|
}
|
||||||
|
VisualizationsOnPage visualizationsOnPage = VisualizationsOnPage.builder().build();
|
||||||
|
visualizations.getVisualizationsOnPages().put(page - 1, visualizationsOnPage);
|
||||||
|
return visualizationsOnPage;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -69,10 +69,10 @@ public class HeadlinesGoldStandardIntegrationTest {
|
|||||||
public void testHeadlineDetection() {
|
public void testHeadlineDetection() {
|
||||||
|
|
||||||
List<Metrics> metrics = new ArrayList<>();
|
List<Metrics> metrics = new ArrayList<>();
|
||||||
//metrics.add(getMetrics("files/syngenta/CustomerFiles/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf",
|
metrics.add(getMetrics("files/syngenta/CustomerFiles/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1).pdf",
|
||||||
// "files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json"));
|
"files/headlineTest/01 - CGA100251 - Acute Oral Toxicity (Up and Down Procedure) - Rat (1)_REDACTION_LOG.json"));
|
||||||
//metrics.add(getMetrics("files/syngenta/CustomerFiles/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf",
|
metrics.add(getMetrics("files/syngenta/CustomerFiles/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23.pdf",
|
||||||
// "files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json"));
|
"files/headlineTest/91 Trinexapac-ethyl_RAR_01_Volume_1_2018-02-23_REDACTION_LOG.json"));
|
||||||
metrics.add(getMetrics("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf", "files/headlineTest/S-Metolachlor_RAR_01_Volume_1_2018-09-06_REDACTION_LOG.json"));
|
metrics.add(getMetrics("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf", "files/headlineTest/S-Metolachlor_RAR_01_Volume_1_2018-09-06_REDACTION_LOG.json"));
|
||||||
|
|
||||||
double precision = metrics.stream().mapToDouble(Metrics::getPrecision).average().orElse(1.0);
|
double precision = metrics.stream().mapToDouble(Metrics::getPrecision).average().orElse(1.0);
|
||||||
@ -96,8 +96,8 @@ public class HeadlinesGoldStandardIntegrationTest {
|
|||||||
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
|
goldStandardLog.getRedactionLogEntry().removeIf(r -> !r.isRedacted() || r.getChanges().get(r.getChanges().size() - 1).getType().equals(ChangeType.REMOVED));
|
||||||
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
|
goldStandardLog.getRedactionLogEntry().forEach(e -> goldStandardHeadlines.add(new Headline(e.getPositions().get(0).getPage(), e.getValue())));
|
||||||
|
|
||||||
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
Document documentGraph = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE,
|
||||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
||||||
pdfFileResource.getFile(),
|
pdfFileResource.getFile(),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse(),
|
new TableServiceResponse(),
|
||||||
|
|||||||
@ -1,10 +1,20 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server;
|
package com.knecon.fforesight.service.layoutparser.server;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
|
||||||
|
import com.iqser.red.storage.commons.utils.FileSystemBackedStorageService;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingFinishedEvent;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
@ -20,28 +30,65 @@ public class LayoutparserEnd2EndTest extends AbstractTest {
|
|||||||
@Autowired
|
@Autowired
|
||||||
private LayoutParsingPipeline layoutParsingPipeline;
|
private LayoutParsingPipeline layoutParsingPipeline;
|
||||||
|
|
||||||
|
@Disabled
|
||||||
@Test
|
@Test
|
||||||
@SneakyThrows
|
|
||||||
public void testLayoutParserEndToEnd() {
|
public void testLayoutParserEndToEnd() {
|
||||||
|
|
||||||
prepareStorage("files/bdr/Wie weiter bei Kristeneinrichtungen.pdf");
|
String filePath = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
|
||||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER);
|
|
||||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
runForFile(filePath);
|
||||||
Arrays.stream(finishedEvent.message().split("\n"))
|
}
|
||||||
.forEach(log::info);
|
|
||||||
|
@Test
|
||||||
|
@Disabled
|
||||||
|
@SneakyThrows
|
||||||
|
public void testLayoutParserEndToEndWithFolder() {
|
||||||
|
|
||||||
|
String folder = "/home/kschuettler/Dokumente/TestFiles/large number of prod files";
|
||||||
|
List<Path> pdfFiles = Files.walk(Path.of(folder))
|
||||||
|
.filter(path -> path.getFileName().toString().endsWith(".pdf"))
|
||||||
|
.sorted(Comparator.comparing(Path::getFileName))
|
||||||
|
.peek(System.out::println)
|
||||||
|
.toList();
|
||||||
|
|
||||||
|
System.out.printf("Found %d pdf files to process %n", pdfFiles.size());
|
||||||
|
AtomicInteger count = new AtomicInteger(0);
|
||||||
|
pdfFiles.stream()
|
||||||
|
.peek(path -> log.info("{}/{}-{}", count.getAndIncrement(), pdfFiles.size(), path.getFileName()))
|
||||||
|
.forEach(path -> runForFile(path.toFile().toString()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testLayoutParserEndToEnd_RED_8747() {
|
private void runForFile(String filePath) {
|
||||||
|
|
||||||
|
String fileName = Path.of(filePath).getFileName().toString();
|
||||||
|
File file;
|
||||||
|
if (filePath.startsWith("files")) { // from resources
|
||||||
|
file = new ClassPathResource(filePath).getFile();
|
||||||
|
} else { // absolute path
|
||||||
|
file = new File(filePath);
|
||||||
|
}
|
||||||
|
|
||||||
|
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER, true);
|
||||||
|
prepareStorage(layoutParsingRequest, file);
|
||||||
|
|
||||||
prepareStorage("files/syngenta/CustomerFiles/SinglePages/Page26_fRR A23317A PI0015600 CEU core part B6 - CZ.pdf");
|
|
||||||
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
|
|
||||||
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
LayoutParsingFinishedEvent finishedEvent = layoutParsingPipeline.parseLayoutAndSaveFilesToStorage(layoutParsingRequest);
|
||||||
|
|
||||||
Arrays.stream(finishedEvent.message().split("\n"))
|
Arrays.stream(finishedEvent.message().split("\n"))
|
||||||
.forEach(log::info);
|
.forEach(log::info);
|
||||||
|
|
||||||
|
File tmpFile = new File("/tmp/layoutparserEND2END/" + fileName + "_VIEWER.pdf");
|
||||||
|
assert tmpFile.getParentFile().exists() || tmpFile.getParentFile().mkdirs();
|
||||||
|
|
||||||
|
storageService.downloadTo(TENANT_ID, layoutParsingRequest.viewerDocumentStorageId(), tmpFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
public void cleanUpTmp() {
|
||||||
|
|
||||||
|
((FileSystemBackedStorageService) storageService).clearStorage();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -23,6 +23,10 @@ import lombok.SneakyThrows;
|
|||||||
|
|
||||||
public class ViewerDocumentTest extends BuildDocumentTest {
|
public class ViewerDocumentTest extends BuildDocumentTest {
|
||||||
|
|
||||||
|
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||||
|
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
@ -32,12 +36,10 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
|
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
|
||||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
Document document = buildGraph(fileName, LayoutParsingType.DOCUMINE);
|
Document document = buildGraph(fileName, LayoutParsingType.REDACT_MANAGER);
|
||||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
|
||||||
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
System.out.printf("Total time: %.2fs%n", ((float) (System.currentTimeMillis() - start)) / 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -55,17 +57,17 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
|
var tableResponse = mapper.readValue(new ClassPathResource(tableFileName).getInputStream(), TableServiceResponse.class);
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
|
|
||||||
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE,
|
var classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.DOCUMINE_OLD,
|
||||||
documentFile,
|
documentFile,
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
tableResponse,
|
tableResponse,
|
||||||
new VisualLayoutParsingResponse(),
|
new VisualLayoutParsingResponse(),
|
||||||
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
|
Map.of("file", Path.of(fileName).getFileName().toFile().toString()));
|
||||||
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
ViewerDocumentService viewerDocumentService = new ViewerDocumentService(null);
|
||||||
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
LayoutGridService layoutGridService = new LayoutGridService(viewerDocumentService);
|
||||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE, classificationDocument);
|
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.DOCUMINE_OLD, classificationDocument);
|
||||||
|
|
||||||
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true);
|
layoutGridService.addLayoutGrid(documentFile, document, new File(tmpFileName), true, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,118 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.server.model;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.IntStream;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
|
||||||
|
class CleanRulingsTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLineBetween() {
|
||||||
|
|
||||||
|
List<Ruling> verticals = List.of(new Ruling(new Point2D.Double(10, 0), new Point2D.Double(10, 10)));
|
||||||
|
List<Ruling> horizontals = List.of(new Ruling(new Point2D.Double(0, 5), new Point2D.Double(10, 5)));
|
||||||
|
CleanRulings cleanRulings = new CleanRulings(horizontals, verticals);
|
||||||
|
Rectangle2D a = new Rectangle2D.Double(1, 6, 3, 3);
|
||||||
|
Rectangle2D b = new Rectangle2D.Double(5, 6, 3, 3);
|
||||||
|
Rectangle2D c = new Rectangle2D.Double(11, 6, 3, 3);
|
||||||
|
Rectangle2D d = new Rectangle2D.Double(1, 1, 3, 3);
|
||||||
|
Rectangle2D e = new Rectangle2D.Double(5, 1, 3, 3);
|
||||||
|
Rectangle2D f = new Rectangle2D.Double(11, 1, 3, 3);
|
||||||
|
|
||||||
|
assertFalse(cleanRulings.lineBetween(a, a));
|
||||||
|
assertFalse(cleanRulings.lineBetween(a, b));
|
||||||
|
assertTrue(cleanRulings.lineBetween(a, c));
|
||||||
|
assertTrue(cleanRulings.lineBetween(a, d));
|
||||||
|
assertTrue(cleanRulings.lineBetween(a, e));
|
||||||
|
assertTrue(cleanRulings.lineBetween(a, f));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSingleLineInRange() {
|
||||||
|
|
||||||
|
List<Ruling> horizontals = List.of(new Ruling(new Point2D.Float(0, 1), new Point2D.Float(100, 1)));
|
||||||
|
List<Ruling> verticals = List.of(new Ruling(new Point2D.Float(1, 0), new Point2D.Float(1, 100)));
|
||||||
|
|
||||||
|
CleanRulings cleanRulings = new CleanRulings(horizontals, verticals);
|
||||||
|
|
||||||
|
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -1).size());
|
||||||
|
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -5).size());
|
||||||
|
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, Float.NaN).size());
|
||||||
|
assertEquals(1, cleanRulings.getVerticalsInXInterval(1, 10).size());
|
||||||
|
assertEquals(0, cleanRulings.getVerticalsInXInterval(100, 101).size());
|
||||||
|
assertEquals(verticals.size(), cleanRulings.getVerticalsInXInterval(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY).size());
|
||||||
|
assertEquals(1, cleanRulings.getVerticalsInXInterval(1 - 1e-5f, 1 + 1e-5f).size());
|
||||||
|
assertEquals(0, cleanRulings.getVerticalsInXInterval(1e-5f, 1 - 1e-5f).size());
|
||||||
|
|
||||||
|
assertEquals(0, cleanRulings.getHorizontalsInYInterval(-2, -1).size());
|
||||||
|
assertEquals(1, cleanRulings.getHorizontalsInYInterval(1, 10).size());
|
||||||
|
assertEquals(0, cleanRulings.getHorizontalsInYInterval(100, 1001).size());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLinesInRange() {
|
||||||
|
|
||||||
|
List<Ruling> horizontals = IntStream.range(0, 101).boxed()
|
||||||
|
.map(y -> new Ruling(new Point2D.Float(0, y), new Point2D.Float(100, y)))
|
||||||
|
.toList();
|
||||||
|
List<Ruling> verticals = IntStream.range(0, 101).boxed()
|
||||||
|
.map(x -> new Ruling(new Point2D.Float(x, 0), new Point2D.Float(x, 100)))
|
||||||
|
.toList();
|
||||||
|
CleanRulings cleanRulings = new CleanRulings(horizontals, verticals);
|
||||||
|
|
||||||
|
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -1).size());
|
||||||
|
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, -5).size());
|
||||||
|
assertEquals(0, cleanRulings.getVerticalsInXInterval(-2, Float.NaN).size());
|
||||||
|
assertEquals(10, cleanRulings.getVerticalsInXInterval(1, 10).size());
|
||||||
|
assertEquals(1, cleanRulings.getVerticalsInXInterval(100, 101).size());
|
||||||
|
assertEquals(verticals.size(), cleanRulings.getVerticalsInXInterval(Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY).size());
|
||||||
|
assertEquals(1, cleanRulings.getVerticalsInXInterval(-1e-5f, 1e-5f).size());
|
||||||
|
assertEquals(1, cleanRulings.getVerticalsInXInterval(0, 0).size());
|
||||||
|
assertEquals(0, cleanRulings.getVerticalsInXInterval(1e-5f, 1 - 1e-5f).size());
|
||||||
|
|
||||||
|
assertEquals(0, cleanRulings.getHorizontalsInYInterval(-2, -1).size());
|
||||||
|
assertEquals(10, cleanRulings.getHorizontalsInYInterval(1, 10).size());
|
||||||
|
assertEquals(1, cleanRulings.getHorizontalsInYInterval(100, 1001).size());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLinesInRangePerformance() {
|
||||||
|
|
||||||
|
List<Ruling> horizontals = IntStream.range(0, (int) 1e6).boxed()
|
||||||
|
.map(y -> new Ruling(new Point2D.Float(0, y), new Point2D.Float(100, y)))
|
||||||
|
.toList();
|
||||||
|
CleanRulings cleanRulings = new CleanRulings(horizontals, Collections.emptyList());
|
||||||
|
|
||||||
|
float startY = 29;
|
||||||
|
float endY = 3000;
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
|
var result = cleanRulings.getHorizontalsInYInterval(startY, endY);
|
||||||
|
long time = System.currentTimeMillis() - start;
|
||||||
|
|
||||||
|
start = System.currentTimeMillis();
|
||||||
|
var result2 = cleanRulings.getHorizontals()
|
||||||
|
.stream()
|
||||||
|
.filter(ruling -> ruling.getY1() >= startY && ruling.getY1() <= endY)
|
||||||
|
.toList();
|
||||||
|
long time2 = System.currentTimeMillis() - start;
|
||||||
|
|
||||||
|
assertEquals(result, result2);
|
||||||
|
assertTrue(time < time2);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,62 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.server.model;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
|
||||||
|
public class RulingTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLineBetween() {
|
||||||
|
|
||||||
|
List<Ruling> verticals = List.of(new Ruling(new Point2D.Double(10, 0), new Point2D.Double(10, 10)), new Ruling(new Point2D.Double(5, 0), new Point2D.Double(5, 5)));
|
||||||
|
List<Ruling> horizontals = List.of(new Ruling(new Point2D.Double(0, 5), new Point2D.Double(10, 5)));
|
||||||
|
|
||||||
|
CleanRulings cleanRulings = new CleanRulings(horizontals, verticals);
|
||||||
|
|
||||||
|
Rectangle2D a = new Rectangle2D.Double(1, 6, 3, 3);
|
||||||
|
Rectangle2D b = new Rectangle2D.Double(5, 6, 3, 3);
|
||||||
|
Rectangle2D c = new Rectangle2D.Double(11, 6, 3, 3);
|
||||||
|
Rectangle2D d = new Rectangle2D.Double(1, 1, 3, 3);
|
||||||
|
Rectangle2D e = new Rectangle2D.Double(5, 1, 3, 3);
|
||||||
|
Rectangle2D f = new Rectangle2D.Double(11, 1, 3, 3);
|
||||||
|
|
||||||
|
assertFalse(cleanRulings.lineBetween(a, a));
|
||||||
|
assertFalse(cleanRulings.lineBetween(a, b));
|
||||||
|
assertTrue(cleanRulings.lineBetween(a, c));
|
||||||
|
assertTrue(cleanRulings.lineBetween(a, d));
|
||||||
|
assertTrue(cleanRulings.lineBetween(a, e));
|
||||||
|
assertTrue(cleanRulings.lineBetween(a, f));
|
||||||
|
|
||||||
|
assertFalse(cleanRulings.lineBetween(d, d));
|
||||||
|
assertTrue(cleanRulings.lineBetween(d, b));
|
||||||
|
assertTrue(cleanRulings.lineBetween(d, c));
|
||||||
|
assertTrue(cleanRulings.lineBetween(d, a));
|
||||||
|
assertTrue(cleanRulings.lineBetween(d, e));
|
||||||
|
assertTrue(cleanRulings.lineBetween(d, f));
|
||||||
|
|
||||||
|
assertFalse(cleanRulings.lineBetween(c, c));
|
||||||
|
assertTrue(cleanRulings.lineBetween(c, b));
|
||||||
|
assertTrue(cleanRulings.lineBetween(c, d));
|
||||||
|
assertTrue(cleanRulings.lineBetween(c, a));
|
||||||
|
assertTrue(cleanRulings.lineBetween(c, e));
|
||||||
|
assertFalse(cleanRulings.lineBetween(c, f));
|
||||||
|
|
||||||
|
var all = List.of(a, b, c, d, e, f);
|
||||||
|
for (Rectangle2D r1 : all) {
|
||||||
|
for (Rectangle2D r2 : all) {
|
||||||
|
assertEquals(cleanRulings.lineBetween(r1, r2), cleanRulings.lineBetween(r2, r1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -52,28 +52,16 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
@Autowired
|
@Autowired
|
||||||
private ObjectMapper objectMapper;
|
private ObjectMapper objectMapper;
|
||||||
|
|
||||||
@Autowired
|
|
||||||
private RedactManagerClassificationService redactManagerClassificationService;
|
|
||||||
|
|
||||||
@Autowired
|
|
||||||
private SectionsBuilderService sectionsBuilderService;
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
|
public ClassificationDocument buildClassificationDocument(File originDocument, TableServiceResponse tableServiceResponse) {
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
return layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
originDocument,
|
originDocument,
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
tableServiceResponse,
|
tableServiceResponse,
|
||||||
new VisualLayoutParsingResponse(),
|
new VisualLayoutParsingResponse(),
|
||||||
Map.of("file","document"));
|
Map.of("file", "document"));
|
||||||
|
|
||||||
redactManagerClassificationService.classifyDocument(classificationDocument);
|
|
||||||
|
|
||||||
sectionsBuilderService.buildSections(classificationDocument);
|
|
||||||
|
|
||||||
return classificationDocument;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -133,7 +121,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
.get(0).getSequences().size()).isEqualTo(8);
|
.get(0).getSequences().size()).isEqualTo(8);
|
||||||
assertThat(classificationDocument.getHeaders()
|
assertThat(classificationDocument.getHeaders()
|
||||||
.get(0).getTextBlocks()
|
.get(0).getTextBlocks()
|
||||||
.get(0).toString()).isEqualTo(textToSearch);
|
.get(0).toString()).contains(textToSearch);
|
||||||
|
|
||||||
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
|
Document document = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER, classificationDocument);
|
||||||
|
|
||||||
@ -143,6 +131,17 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void testTableAndCellRotations() {
|
||||||
|
String fileName = "files/Minimal Examples/simpleTablesRotated.pdf";
|
||||||
|
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
||||||
|
|
||||||
|
ClassificationDocument classificationDocument = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Disabled
|
@Disabled
|
||||||
@Test
|
@Test
|
||||||
public void testScanRotationBorderIsIgnored() throws IOException {
|
public void testScanRotationBorderIsIgnored() throws IOException {
|
||||||
@ -157,7 +156,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
.stream())
|
.stream())
|
||||||
.collect(Collectors.toList())).isNotEmpty();
|
.collect(Collectors.toList())).isNotEmpty();
|
||||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
var tables = document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList();
|
||||||
|
|
||||||
// Quality of the table parsing is not good, because the file is rotated at scanning.
|
// Quality of the table parsing is not good, because the file is rotated at scanning.
|
||||||
// We only assert that the table border is not the page border.
|
// We only assert that the table border is not the page border.
|
||||||
@ -179,12 +182,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
imageServiceResponse.getData()
|
imageServiceResponse.getData()
|
||||||
.forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
.forEach(imageMetadata -> images.computeIfAbsent(imageMetadata.getPosition().getPageNumber(), x -> new ArrayList<>())
|
||||||
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
.add(new ClassifiedImage(new Rectangle2D.Double(imageMetadata.getPosition().getX1(),
|
||||||
imageMetadata.getPosition().getY1(),
|
imageMetadata.getPosition().getY1(),
|
||||||
imageMetadata.getGeometry().getWidth(),
|
imageMetadata.getGeometry().getWidth(),
|
||||||
imageMetadata.getGeometry().getHeight()),
|
imageMetadata.getGeometry().getHeight()),
|
||||||
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
|
ImageType.valueOf(imageMetadata.getClassification().getLabel().toUpperCase(Locale.ROOT)),
|
||||||
imageMetadata.isAlpha(),
|
imageMetadata.isAlpha(),
|
||||||
imageMetadata.getPosition().getPageNumber())));
|
imageMetadata.getPosition().getPageNumber())));
|
||||||
|
|
||||||
System.out.println("object");
|
System.out.println("object");
|
||||||
}
|
}
|
||||||
@ -196,11 +199,22 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).collect(Collectors.toList())).isNotEmpty();
|
assertThat(document.getSections()
|
||||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(0);
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.collect(Collectors.toList())).isNotEmpty();
|
||||||
|
TablePageBlock table = document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList()
|
||||||
|
.get(0);
|
||||||
assertThat(table.getColCount()).isEqualTo(6);
|
assertThat(table.getColCount()).isEqualTo(6);
|
||||||
assertThat(table.getRowCount()).isEqualTo(13);
|
assertThat(table.getRowCount()).isEqualTo(13);
|
||||||
assertThat(table.getRows().stream().mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
assertThat(table.getRows()
|
||||||
|
.stream()
|
||||||
|
.mapToInt(List::size).sum()).isEqualTo(6 * 13);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -373,29 +387,30 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
validateTable(document, 0, 8, 8, 0, 0);
|
validateTable(document, 0, 8, 8, 0, 0);
|
||||||
|
|
||||||
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
|
List<List<String>> values = Arrays.asList(Arrays.asList("Annex point Reference within DAR/RAR",
|
||||||
"Author, date",
|
"Author, date",
|
||||||
"Study title",
|
"Study title",
|
||||||
"Analytical method Author, date, No.",
|
"Analytical method Author, date, No.",
|
||||||
"Technique, LOQ of the method, validated working range",
|
"Technique, LOQ of the method, validated working range",
|
||||||
"Method meets analytical validation criteria",
|
"Method meets analytical validation criteria",
|
||||||
"Remarks (in case validation criteria are not met)",
|
"Remarks (in case validation criteria are not met)",
|
||||||
"Acceptability of the method"),
|
"Acceptability of the method"),
|
||||||
Arrays.asList("Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
Arrays.asList(
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies",
|
||||||
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
|
"Part (a) Methods in soil, water, sediment, air and any additional matrices used in support of environmental fate studies"),
|
||||||
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
Arrays.asList("CA 7.1.2.1.1 DAR (2009)",
|
||||||
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
"Evans P.G. 2001 TMJ4569B, VV-323245",
|
||||||
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
|
"Azoxystrobin Laboratory Degradation Study in Three Soil Types, Sampled from Holland and the United Kingdom",
|
||||||
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
"Method: RAM 269 Johnson R.I., Tummon O.J., Earl M. 1995 RJ1864B, VV-377731 Johnson R.I., Tummon O.J., Earl M. 1998 RAM 269/02, VV-124072 Johnson R.I., Tummon O.J., Earl M. 2000 RAM 269/03, VV-123986 Validation: Robinson N.J. 2001 TMJ4617B, VV-895845",
|
||||||
"Y",
|
"LC-MS/MS LOQ: 0.01 mg/kg (R401553 (SYN50165 7), R402173 (SYN501114 )) or 0.02 mg/kg (azoxystrobin, R230310, R234886) Working range: 0.02-1.0 or 0.01-0.5 mg/kg (depending on analyte) Other supporting quantificati on methods: HPLC-UV GC-MSD",
|
||||||
"N/A",
|
"Y",
|
||||||
"Y"));
|
"N/A",
|
||||||
|
"Y"));
|
||||||
|
|
||||||
validateTable(document, 0, values);
|
validateTable(document, 0, values);
|
||||||
|
|
||||||
@ -785,6 +800,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testMergedEntities_Page26() throws IOException {
|
public void testMergedEntities_Page26() throws IOException {
|
||||||
|
|
||||||
@ -802,7 +818,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void toHtml(ClassificationDocument document, String filename) {
|
private void toHtml(ClassificationDocument document, String filename) {
|
||||||
|
|
||||||
var tables = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList();
|
var tables = document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList();
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
|
|
||||||
int currentPage = 1;
|
int currentPage = 1;
|
||||||
@ -823,9 +843,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
||||||
|
|
||||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
|
TablePageBlock table = document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList()
|
||||||
|
.get(tableIndex);
|
||||||
List<List<Cell>> rows = table.getRows();
|
List<List<Cell>> rows = table.getRows();
|
||||||
int emptyCellsFoundFound = rows.stream().flatMap(List::stream).toList().stream().filter(f -> f.toString().isEmpty()).toList().size();
|
int emptyCellsFoundFound = rows.stream()
|
||||||
|
.flatMap(List::stream)
|
||||||
|
.toList()
|
||||||
|
.stream()
|
||||||
|
.filter(f -> f.toString().isEmpty())
|
||||||
|
.toList().size();
|
||||||
|
|
||||||
for (List<Cell> row : table.getRows()) {
|
for (List<Cell> row : table.getRows()) {
|
||||||
row.forEach(r -> System.out.println(r.toString()));
|
row.forEach(r -> System.out.println(r.toString()));
|
||||||
@ -840,11 +870,20 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
|
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
|
||||||
|
|
||||||
TablePageBlock table = document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().get(tableIndex);
|
TablePageBlock table = document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList()
|
||||||
|
.get(tableIndex);
|
||||||
List<List<Cell>> rows = table.getRows();
|
List<List<Cell>> rows = table.getRows();
|
||||||
|
|
||||||
List<Cell> rowsFlattened = rows.stream().flatMap(List::stream).toList();
|
List<Cell> rowsFlattened = rows.stream()
|
||||||
List<String> valuesFlattened = values.stream().flatMap(List::stream).toList();
|
.flatMap(List::stream)
|
||||||
|
.toList();
|
||||||
|
List<String> valuesFlattened = values.stream()
|
||||||
|
.flatMap(List::stream)
|
||||||
|
.toList();
|
||||||
|
|
||||||
for (int i = 0; i < valuesFlattened.size(); i++) {
|
for (int i = 0; i < valuesFlattened.size(); i++) {
|
||||||
Cell cell = rowsFlattened.get(i);
|
Cell cell = rowsFlattened.get(i);
|
||||||
@ -857,7 +896,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
private void validateTableSize(ClassificationDocument document, int tableSize) {
|
private void validateTableSize(ClassificationDocument document, int tableSize) {
|
||||||
|
|
||||||
assertThat(document.getSections().stream().flatMap(paragraph -> paragraph.getTables().stream()).toList().size()).isEqualTo(tableSize);
|
assertThat(document.getSections()
|
||||||
|
.stream()
|
||||||
|
.flatMap(paragraph -> paragraph.getTables()
|
||||||
|
.stream())
|
||||||
|
.toList().size()).isEqualTo(tableSize);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -28,29 +28,30 @@ class InvisibleTableDetectionServiceTest {
|
|||||||
|
|
||||||
String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
|
String fileName = "files/basf/CustomerFiles/invisible_tables_test-two-pages_ocred.pdf";
|
||||||
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
|
var tmpFileName = Path.of("/tmp/").resolve(Path.of(fileName).getFileName() + "_TABLE.pdf").toString();
|
||||||
List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName).stream().map(PageInformationService::build).collect(Collectors.toList());
|
List<PageInformation> pageContents = PageContentExtractor.getSortedPageContents(fileName)
|
||||||
|
.stream()
|
||||||
|
.map(PageInformationService::build)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
int pageNumber = 1;
|
int pageNumber = 1;
|
||||||
Rectangle2D tableBBox = pageContents.get(0)
|
Rectangle2D tableBBox = pageContents.get(0).getPageContents().getSortedTextPositionSequences().subList(45, 152)
|
||||||
.getPageContents()
|
|
||||||
.getSortedTextPositionSequences()
|
|
||||||
.subList(45, 152)
|
|
||||||
.stream()
|
.stream()
|
||||||
.map(TextPositionSequence::getRectangle)
|
.map(TextPositionSequence::getBBox)
|
||||||
.map(RectangleTransformations::toRectangle2D)
|
|
||||||
.map(this::mirrorY)
|
.map(this::mirrorY)
|
||||||
.collect(RectangleTransformations.collectBBox());
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
|
||||||
List<TextPositionSequence> textPositionSequences = pageContents.get(0)
|
List<TextPositionSequence> textPositionSequences = pageContents.get(0).getPageContents().getSortedTextPositionSequences()
|
||||||
.getPageContents()
|
|
||||||
.getSortedTextPositionSequences()
|
|
||||||
.stream()
|
.stream()
|
||||||
.filter(textPositionSequence -> tableBBox.contains(mirrorY(RectangleTransformations.toRectangle2D(textPositionSequence.getRectangle()))))
|
.filter(textPositionSequence -> tableBBox.contains(mirrorY(textPositionSequence.getBBox())))
|
||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
var table = InvisibleTableDetectionService.detectTable(textPositionSequences, tableBBox);
|
var table = InvisibleTableDetectionService.detectTable(textPositionSequences, tableBBox);
|
||||||
|
|
||||||
PdfDraw.drawRectanglesPerPage(fileName, List.of(table.stream().flatMap(Collection::stream).toList(), Collections.emptyList()), tmpFileName);
|
PdfDraw.drawRectanglesPerPage(fileName,
|
||||||
|
List.of(table.stream()
|
||||||
|
.flatMap(Collection::stream)
|
||||||
|
.toList(), Collections.emptyList()),
|
||||||
|
tmpFileName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -29,9 +29,7 @@ class PageContentExtractorTest {
|
|||||||
textPositionPerPage.stream()
|
textPositionPerPage.stream()
|
||||||
.map(t -> t.getSortedTextPositionSequences()
|
.map(t -> t.getSortedTextPositionSequences()
|
||||||
.stream()
|
.stream()
|
||||||
.map(TextPositionSequence::getRectangle)
|
.map(TextPositionSequence::getBBoxInitialUserSpace)
|
||||||
.map(RectangleTransformations::toRectangle2D)
|
|
||||||
//.map(textPositionSequence -> (Rectangle2D) new Rectangle2D.Double(textPositionSequence.getMaxXDirAdj(), textPositionSequence.getMaxYDirAdj(), textPositionSequence.getWidth(), textPositionSequence.getHeight()))
|
|
||||||
.map(List::of)
|
.map(List::of)
|
||||||
.toList())
|
.toList())
|
||||||
.toList(), tmpFileName);
|
.toList(), tmpFileName);
|
||||||
|
|||||||
@ -52,8 +52,8 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||||
List<List<Rectangle2D>> rectanglesPerPage = new LinkedList<>();
|
List<List<Rectangle2D>> rectanglesPerPage = new LinkedList<>();
|
||||||
for (PageContents pageContent : pageContents) {
|
for (PageContents pageContent : pageContents) {
|
||||||
CleanRulings cleanRulings = rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings());
|
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
|
||||||
List<Rectangle2D> rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontal(), cleanRulings.getVertical());
|
List<Rectangle2D> rects = RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||||
rectanglesPerPage.add(rects);
|
rectanglesPerPage.add(rects);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -72,15 +72,16 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||||
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
|
List<CleanRulings> cleanRulingsPerPage = new LinkedList<>();
|
||||||
for (PageContents pageContent : pageContents) {
|
for (PageContents pageContent : pageContents) {
|
||||||
cleanRulingsPerPage.add(rulingCleaningService.getCleanRulings(Collections.emptyList(), pageContent.getRulings()));
|
cleanRulingsPerPage.add(rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings()));
|
||||||
}
|
}
|
||||||
var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVertical).collect(Collectors.toList());
|
var cleanRulings = cleanRulingsPerPage.stream().map(CleanRulings::getVerticals).collect(Collectors.toList());
|
||||||
PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName);
|
PdfDraw.drawLinesPerPage(fileName, cleanRulings, lineFileName);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@Disabled
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testTableExtraction() {
|
public void testTableExtraction() {
|
||||||
|
|
||||||
@ -98,6 +99,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void writeJsons(Path filename) {
|
private void writeJsons(Path filename) {
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,84 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.server.services;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageContents;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Ruling;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.RulingCleaningService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangularIntersectionFinder;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
|
||||||
|
public class RulingsClassifierTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void textRulingExtractionTest() {
|
||||||
|
|
||||||
|
String fileName = "files/Minimal Examples/RotateTextWithRulingsTestFile.pdf";
|
||||||
|
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||||
|
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||||
|
|
||||||
|
for (PageContents pageContent : pageContents) {
|
||||||
|
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
|
||||||
|
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||||
|
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings);
|
||||||
|
|
||||||
|
assertTrue(pageContent.getSortedTextPositionSequences()
|
||||||
|
.stream()
|
||||||
|
.filter(word -> word.toString().equals("Underlined"))
|
||||||
|
.allMatch(TextPositionSequence::isUnderline));
|
||||||
|
assertTrue(pageContent.getSortedTextPositionSequences()
|
||||||
|
.stream()
|
||||||
|
.filter(word -> word.toString().equals("Striketrough"))
|
||||||
|
.allMatch(TextPositionSequence::isStrikethrough));
|
||||||
|
|
||||||
|
assertEquals(4,
|
||||||
|
cleanRulings.buildAll()
|
||||||
|
.stream()
|
||||||
|
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.STRIKETROUGH))
|
||||||
|
.count());
|
||||||
|
assertEquals(4,
|
||||||
|
cleanRulings.buildAll()
|
||||||
|
.stream()
|
||||||
|
.filter(ruling -> ruling.getClassification().equals(Ruling.Classification.UNDERLINE))
|
||||||
|
.count());
|
||||||
|
assertEquals(0, cleanRulings.withoutTextRulings().buildAll().size());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@SneakyThrows
|
||||||
|
public void tableRulingExtractionTest() {
|
||||||
|
|
||||||
|
String fileName = "files/SinglePages/AbsolutelyEnormousTable.pdf";
|
||||||
|
List<PageContents> pageContents = PageContentExtractor.getSortedPageContents(fileName);
|
||||||
|
RulingCleaningService rulingCleaningService = new RulingCleaningService();
|
||||||
|
|
||||||
|
for (PageContents pageContent : pageContents) {
|
||||||
|
CleanRulings cleanRulings = rulingCleaningService.deduplicateAndStraightenRulings(Collections.emptyList(), pageContent.getRulings());
|
||||||
|
RectangularIntersectionFinder.find(cleanRulings.getHorizontals(), cleanRulings.getVerticals());
|
||||||
|
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(pageContent.getSortedTextPositionSequences(), cleanRulings);
|
||||||
|
|
||||||
|
assertEquals(30, cleanRulings.getHorizontals().size());
|
||||||
|
assertEquals(30, cleanRulings.getTableLines().getHorizontals().size());
|
||||||
|
|
||||||
|
assertEquals(144, cleanRulings.getVerticals().size());
|
||||||
|
assertEquals(144, cleanRulings.getTableLines().getVerticals().size());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -1,6 +1,9 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.utils;
|
package com.knecon.fforesight.service.layoutparser.server.utils;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.nio.file.Path;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
@ -102,29 +105,22 @@ public abstract class AbstractTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(String fileName, LayoutParsingType layoutParsingType, boolean debug) {
|
||||||
protected LayoutParsingRequest prepareStorage(InputStream fileInputStream) {
|
|
||||||
|
|
||||||
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileInputStream);
|
|
||||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
protected LayoutParsingRequest buildDefaultLayoutParsingRequest(LayoutParsingType layoutParsingType) {
|
|
||||||
|
|
||||||
|
var identifier = debug ? Map.of("fileId", fileName, "debug", "true") : Map.of("fileId", fileName);
|
||||||
return LayoutParsingRequest.builder()
|
return LayoutParsingRequest.builder()
|
||||||
.identifier(Map.of("fileId", "1337"))
|
.identifier(identifier)
|
||||||
.layoutParsingType(layoutParsingType)
|
.layoutParsingType(layoutParsingType)
|
||||||
.originFileStorageId(ORIGIN_FILE_ID)
|
.originFileStorageId(fileName + ORIGIN_FILE_ID)
|
||||||
.tablesFileStorageId(Optional.of(TABLE_FILE_ID))
|
.tablesFileStorageId(Optional.of(fileName + TABLE_FILE_ID))
|
||||||
.imagesFileStorageId(Optional.of(IMAGE_FILE_ID))
|
.imagesFileStorageId(Optional.of(fileName + IMAGE_FILE_ID))
|
||||||
.visualLayoutParsingFileId(Optional.of(VISUAL_LAYOUT_FILE))
|
.visualLayoutParsingFileId(Optional.empty())
|
||||||
.structureFileStorageId(STRUCTURE_FILE_ID)
|
.structureFileStorageId(fileName + STRUCTURE_FILE_ID)
|
||||||
.textBlockFileStorageId(TEXT_FILE_ID)
|
.textBlockFileStorageId(fileName + TEXT_FILE_ID)
|
||||||
.positionBlockFileStorageId(POSITION_FILE_ID)
|
.positionBlockFileStorageId(fileName + POSITION_FILE_ID)
|
||||||
.pageFileStorageId(PAGES_FILE_ID)
|
.pageFileStorageId(fileName + PAGES_FILE_ID)
|
||||||
.simplifiedTextStorageId(SIMPLIFIED_ID)
|
.simplifiedTextStorageId(fileName + SIMPLIFIED_ID)
|
||||||
.viewerDocumentStorageId(VIEWER_DOCUMENT_ID)
|
.viewerDocumentStorageId(fileName + VIEWER_DOCUMENT_ID)
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -148,10 +144,28 @@ public abstract class AbstractTest {
|
|||||||
ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile);
|
ClassPathResource imageInfoFileResource = new ClassPathResource(imageInfoFile);
|
||||||
ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource(visualLayoutParsingResponseFile);
|
ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource(visualLayoutParsingResponseFile);
|
||||||
|
|
||||||
return prepareStorage(pdfFileResource.getInputStream(),
|
return prepareStorage(Path.of(file).getFileName().toString(),
|
||||||
cvServiceResponseFileResource.getInputStream(),
|
pdfFileResource.getInputStream(),
|
||||||
imageInfoFileResource.getInputStream(),
|
cvServiceResponseFileResource.getInputStream(),
|
||||||
visualLayoutParsingResponseResource.getInputStream());
|
imageInfoFileResource.getInputStream(),
|
||||||
|
visualLayoutParsingResponseResource.getInputStream());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
protected void prepareStorage(LayoutParsingRequest layoutParsingRequest, File file) {
|
||||||
|
|
||||||
|
ClassPathResource cvServiceResponseFileResource = new ClassPathResource("cv_table_parsing_response/empty.json");
|
||||||
|
ClassPathResource imageInfoFileResource = new ClassPathResource("image_service_response/empty.json");
|
||||||
|
ClassPathResource visualLayoutParsingResponseResource = new ClassPathResource("visual_layout_parsing_response/empty.json");
|
||||||
|
|
||||||
|
try (var in = new FileInputStream(file)) {
|
||||||
|
prepareStorage(layoutParsingRequest,
|
||||||
|
in,
|
||||||
|
cvServiceResponseFileResource.getInputStream(),
|
||||||
|
imageInfoFileResource.getInputStream(),
|
||||||
|
visualLayoutParsingResponseResource.getInputStream());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -162,12 +176,29 @@ public abstract class AbstractTest {
|
|||||||
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
|
storageService.storeObject(TenantContext.getTenantId(), TABLE_FILE_ID, cvServiceResponseFileStream);
|
||||||
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
|
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
|
||||||
|
|
||||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
|
return buildDefaultLayoutParsingRequest("test", LayoutParsingType.REDACT_MANAGER_OLD, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
protected LayoutParsingRequest prepareStorage(InputStream fileStream,
|
protected void prepareStorage(LayoutParsingRequest layoutParsingRequest,
|
||||||
|
InputStream fileStream,
|
||||||
|
InputStream cvServiceResponseFileStream,
|
||||||
|
InputStream imageInfoStream,
|
||||||
|
InputStream visualLayoutParsingResponseFileStream) {
|
||||||
|
|
||||||
|
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.imagesFileStorageId().get(), imageInfoStream);
|
||||||
|
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.tablesFileStorageId().get(), cvServiceResponseFileStream);
|
||||||
|
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.originFileStorageId(), fileStream);
|
||||||
|
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
|
||||||
|
storageService.storeObject(TenantContext.getTenantId(), layoutParsingRequest.visualLayoutParsingFileId().get(), visualLayoutParsingResponseFileStream);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
protected LayoutParsingRequest prepareStorage(String fileName,
|
||||||
|
InputStream fileStream,
|
||||||
InputStream cvServiceResponseFileStream,
|
InputStream cvServiceResponseFileStream,
|
||||||
InputStream imageInfoStream,
|
InputStream imageInfoStream,
|
||||||
InputStream visualLayoutParsingResponseFileStream) {
|
InputStream visualLayoutParsingResponseFileStream) {
|
||||||
@ -177,7 +208,7 @@ public abstract class AbstractTest {
|
|||||||
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
|
storageService.storeObject(TenantContext.getTenantId(), ORIGIN_FILE_ID, fileStream);
|
||||||
storageService.storeObject(TenantContext.getTenantId(), VISUAL_LAYOUT_FILE, visualLayoutParsingResponseFileStream);
|
storageService.storeObject(TenantContext.getTenantId(), VISUAL_LAYOUT_FILE, visualLayoutParsingResponseFileStream);
|
||||||
|
|
||||||
return buildDefaultLayoutParsingRequest(LayoutParsingType.REDACT_MANAGER_OLD);
|
return buildDefaultLayoutParsingRequest(fileName, LayoutParsingType.REDACT_MANAGER_OLD, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -1,11 +1,13 @@
|
|||||||
package com.knecon.fforesight.service.layoutparser.server.utils;
|
package com.knecon.fforesight.service.layoutparser.server.utils;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.nio.file.Path;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
import org.springframework.core.io.ClassPathResource;
|
import org.springframework.core.io.ClassPathResource;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingRequest;
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
import com.knecon.fforesight.service.layoutparser.processor.LayoutParsingPipeline;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
@ -28,11 +30,11 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
|||||||
File fileResource = new ClassPathResource(filename).getFile();
|
File fileResource = new ClassPathResource(filename).getFile();
|
||||||
prepareStorage(filename);
|
prepareStorage(filename);
|
||||||
return layoutParsingPipeline.parseLayout(layoutParsingType,
|
return layoutParsingPipeline.parseLayout(layoutParsingType,
|
||||||
fileResource,
|
fileResource,
|
||||||
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
|
layoutParsingStorageService.getImagesFile(IMAGE_FILE_ID),
|
||||||
new TableServiceResponse(),
|
new TableServiceResponse(),
|
||||||
new VisualLayoutParsingResponse(),
|
new VisualLayoutParsingResponse(),
|
||||||
Map.of("file",filename));
|
Map.of("file", filename, "debug", "true"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -46,13 +48,25 @@ public abstract class BuildDocumentTest extends AbstractTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
|
protected Document buildGraph(String filename, LayoutParsingType layoutParsingType) {
|
||||||
|
|
||||||
if (filename.equals("files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
|
if (!filename.startsWith("files") && filename.startsWith("/")) {
|
||||||
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json");
|
LayoutParsingRequest layoutParsingRequest = buildDefaultLayoutParsingRequest(Path.of(filename).getFileName().toString(), LayoutParsingType.REDACT_MANAGER, true);
|
||||||
|
prepareStorage(layoutParsingRequest, new File(filename));
|
||||||
|
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType,
|
||||||
|
layoutParsingPipeline.parseLayout(layoutParsingType,
|
||||||
|
new File(filename),
|
||||||
|
layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get()),
|
||||||
|
new TableServiceResponse(),
|
||||||
|
new VisualLayoutParsingResponse(),
|
||||||
|
layoutParsingRequest.identifier()));
|
||||||
} else {
|
} else {
|
||||||
prepareStorage(filename);
|
if (filename.equals("files/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf")) {
|
||||||
|
prepareStorage(filename, "cv_table_parsing_response/empty.json", "image_service_response/S-Metolachlor_RAR_01_Volume_1_2018-09-06.IMAGE_INFO.json");
|
||||||
|
} else {
|
||||||
|
prepareStorage(filename);
|
||||||
|
}
|
||||||
|
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType));
|
||||||
}
|
}
|
||||||
|
|
||||||
return DocumentGraphFactory.buildDocumentGraph(layoutParsingType, parseLayout(filename, layoutParsingType));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -26,6 +26,26 @@ public class ContentStreams {
|
|||||||
|
|
||||||
public static Identifier ESCAPE_END = new Identifier("escape start", COSName.getPDFName("ESCAPE_END"), false);
|
public static Identifier ESCAPE_END = new Identifier("escape start", COSName.getPDFName("ESCAPE_END"), false);
|
||||||
|
|
||||||
|
public static Identifier CLEAN_RULINGS = new Identifier("Cleaned Rulings", COSName.getPDFName("KNECON_CLEAN_RULINGS"), true);
|
||||||
|
|
||||||
|
public static Identifier RULINGS = new Identifier("Rulings", COSName.getPDFName("KNECON_RULINGS"), true);
|
||||||
|
|
||||||
|
public static Identifier WORDS = new Identifier("Words", COSName.getPDFName("KNECON_WORDS"), true);
|
||||||
|
|
||||||
|
public static Identifier ZONES = new Identifier("Text Zones", COSName.getPDFName("KNECON_ZONES"), true);
|
||||||
|
|
||||||
|
public static Identifier LINES = new Identifier("Text Lines", COSName.getPDFName("KNECON_LINES"), true);
|
||||||
|
|
||||||
|
public static Identifier CELLS = new Identifier("Cells", COSName.getPDFName("KNECON_CELLS"), true);
|
||||||
|
|
||||||
|
public static Identifier MAIN_BODY = new Identifier("Main Text Body", COSName.getPDFName("KNECON_MAIN_BODY"), true);
|
||||||
|
|
||||||
|
public static Identifier MARKED_CONTENT = new Identifier("Marked content", COSName.getPDFName("KNECON_MARKED_CONTENT"), true);
|
||||||
|
|
||||||
|
public static Identifier NEIGHBOURS = new Identifier("Neighbours", COSName.getPDFName("KNECON_NEIGHBOURS"), true);
|
||||||
|
|
||||||
|
public static Identifier CHARACTERS = new Identifier("Characters", COSName.getPDFName("KNECON_CHARACTERS"), true);
|
||||||
|
|
||||||
public static List<Identifier> allContentStreams = List.of(KNECON_LAYOUT,
|
public static List<Identifier> allContentStreams = List.of(KNECON_LAYOUT,
|
||||||
KNECON_VISUAL_PARSING,
|
KNECON_VISUAL_PARSING,
|
||||||
KNECON_OCR,
|
KNECON_OCR,
|
||||||
@ -33,7 +53,17 @@ public class ContentStreams {
|
|||||||
KNECON_OCR_TEXT_DEBUG,
|
KNECON_OCR_TEXT_DEBUG,
|
||||||
OTHER,
|
OTHER,
|
||||||
ESCAPE_START,
|
ESCAPE_START,
|
||||||
ESCAPE_END);
|
ESCAPE_END,
|
||||||
|
RULINGS,
|
||||||
|
CLEAN_RULINGS,
|
||||||
|
WORDS,
|
||||||
|
ZONES,
|
||||||
|
LINES,
|
||||||
|
MAIN_BODY,
|
||||||
|
MARKED_CONTENT,
|
||||||
|
NEIGHBOURS,
|
||||||
|
CHARACTERS,
|
||||||
|
CELLS);
|
||||||
|
|
||||||
public record Identifier(String name, COSName cosName, boolean optionalContent) {
|
public record Identifier(String name, COSName cosName, boolean optionalContent) {
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,6 @@
|
|||||||
package com.knecon.fforesight.service.viewerdoc.model;
|
package com.knecon.fforesight.service.viewerdoc.model;
|
||||||
|
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
import com.knecon.fforesight.service.viewerdoc.ContentStreams;
|
||||||
@ -17,7 +18,8 @@ import lombok.experimental.FieldDefaults;
|
|||||||
public class Visualizations {
|
public class Visualizations {
|
||||||
|
|
||||||
ContentStreams.Identifier layer;
|
ContentStreams.Identifier layer;
|
||||||
Map<Integer, VisualizationsOnPage> visualizationsOnPages;
|
@Builder.Default
|
||||||
|
Map<Integer, VisualizationsOnPage> visualizationsOnPages = new LinkedHashMap<>();
|
||||||
boolean layerVisibilityDefaultValue;
|
boolean layerVisibilityDefaultValue;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -53,12 +53,6 @@ public class ViewerDocumentService {
|
|||||||
private final ObservationRegistry registry;
|
private final ObservationRegistry registry;
|
||||||
|
|
||||||
|
|
||||||
public void addVisualizationsOnPage(File originFile, File destinationFile, Visualizations visualizations) {
|
|
||||||
|
|
||||||
addVisualizationsOnPage(originFile, destinationFile, List.of(visualizations));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Observed(name = "ViewerDocumentService", contextualName = "add-visualizations")
|
@Observed(name = "ViewerDocumentService", contextualName = "add-visualizations")
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void addVisualizationsOnPage(File originFile, File destinationFile, List<Visualizations> visualizations) {
|
public void addVisualizationsOnPage(File originFile, File destinationFile, List<Visualizations> visualizations) {
|
||||||
@ -70,9 +64,14 @@ public class ViewerDocumentService {
|
|||||||
|
|
||||||
PDDocument pdDocument = openPDDocument(tmpFile.toFile());
|
PDDocument pdDocument = openPDDocument(tmpFile.toFile());
|
||||||
|
|
||||||
enrichObservation(pdDocument, visualizations.stream().map(Visualizations::getLayer).toList());
|
enrichObservation(pdDocument,
|
||||||
|
visualizations.stream()
|
||||||
|
.map(Visualizations::getLayer)
|
||||||
|
.toList());
|
||||||
|
|
||||||
Set<ContentStreams.Identifier> allLayers = visualizations.stream().map(Visualizations::getLayer).collect(Collectors.toUnmodifiableSet());
|
Set<ContentStreams.Identifier> allLayers = visualizations.stream()
|
||||||
|
.map(Visualizations::getLayer)
|
||||||
|
.collect(Collectors.toUnmodifiableSet());
|
||||||
|
|
||||||
Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument);
|
Map<ContentStreams.Identifier, PDOptionalContentGroup> optionalContentGroupMap = addLayersToDocument(visualizations, pdDocument);
|
||||||
|
|
||||||
@ -229,11 +228,11 @@ public class ViewerDocumentService {
|
|||||||
Matrix textMatrix;
|
Matrix textMatrix;
|
||||||
if (placedText.textMatrix().isEmpty()) {
|
if (placedText.textMatrix().isEmpty()) {
|
||||||
textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
|
textMatrix = new Matrix((float) textDeRotationMatrix.getScaleX(),
|
||||||
(float) textDeRotationMatrix.getShearX(),
|
(float) textDeRotationMatrix.getShearX(),
|
||||||
(float) textDeRotationMatrix.getShearY(),
|
(float) textDeRotationMatrix.getShearY(),
|
||||||
(float) textDeRotationMatrix.getScaleY(),
|
(float) textDeRotationMatrix.getScaleY(),
|
||||||
(float) placedText.lineStart().getX(),
|
(float) placedText.lineStart().getX(),
|
||||||
(float) placedText.lineStart().getY());
|
(float) placedText.lineStart().getY());
|
||||||
} else {
|
} else {
|
||||||
textMatrix = placedText.textMatrix().get();
|
textMatrix = placedText.textMatrix().get();
|
||||||
}
|
}
|
||||||
|
|||||||
@ -12,4 +12,4 @@ commit_hash=$(git rev-parse --short=5 HEAD)
|
|||||||
buildName="${USER}-${branch}-${commit_hash}"
|
buildName="${USER}-${branch}-${commit_hash}"
|
||||||
|
|
||||||
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
|
gradle bootBuildImage --publishImage -PbuildbootDockerHostNetwork=true -Pversion=$buildName --no-build-cache
|
||||||
echo "nexus.knecon.com:5001/ff/${dir}-service-server:$buildName"
|
echo "nexus.knecon.com:5001/ff/layoutparser-service-server:$buildName"
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user