Compare commits
30 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0c8b2e6d44 | ||
|
|
b08ed2037e | ||
|
|
b50bfed69d | ||
|
|
49f13d1f03 | ||
|
|
61c90fc30d | ||
|
|
6a0661cf09 | ||
|
|
2d33615b94 | ||
|
|
1856fed640 | ||
|
|
2fcaeb3d8c | ||
|
|
4e07ba4ff1 | ||
|
|
cfb6f0acfa | ||
|
|
a9338262c5 | ||
|
|
d2dc369df3 | ||
|
|
f7aeb9a406 | ||
|
|
9bf2f5c56c | ||
|
|
c071a133e6 | ||
|
|
9f9ea68706 | ||
|
|
85e3cf0ecc | ||
|
|
17756f5977 | ||
|
|
59d9d6c3e6 | ||
|
|
c888746761 | ||
|
|
7279d0a870 | ||
|
|
c84a199f9d | ||
|
|
09148960cf | ||
|
|
77ee8dd5bd | ||
|
|
e9d1bdc94f | ||
|
|
894355c7cd | ||
|
|
ca35feeb63 | ||
|
|
a32a43fc62 | ||
|
|
7f675b41cf |
@ -6,6 +6,7 @@ import java.util.Locale;
|
|||||||
public enum NodeType implements Serializable {
|
public enum NodeType implements Serializable {
|
||||||
DOCUMENT,
|
DOCUMENT,
|
||||||
SECTION,
|
SECTION,
|
||||||
|
SUPER_SECTION,
|
||||||
HEADLINE,
|
HEADLINE,
|
||||||
PARAGRAPH,
|
PARAGRAPH,
|
||||||
TABLE,
|
TABLE,
|
||||||
|
|||||||
@ -2,6 +2,7 @@ package com.knecon.fforesight.service.layoutparser.processor;
|
|||||||
|
|
||||||
import static java.lang.String.format;
|
import static java.lang.String.format;
|
||||||
|
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
import java.awt.geom.Rectangle2D;
|
import java.awt.geom.Rectangle2D;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -29,6 +30,11 @@ import com.knecon.fforesight.service.layoutparser.processor.model.Classification
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.ImageType;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineExtractorService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineValidationService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TOCEnrichmentService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
@ -45,6 +51,7 @@ import com.knecon.fforesight.service.layoutparser.processor.services.RulingClean
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.SimplifiedSectionTextService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.TableExtractionService;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.BlockificationPostprocessingService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
import com.knecon.fforesight.service.layoutparser.processor.services.TextRulingsClassifier;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
import com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocuMineBlockificationService;
|
||||||
@ -90,12 +97,16 @@ public class LayoutParsingPipeline {
|
|||||||
TableExtractionService tableExtractionService;
|
TableExtractionService tableExtractionService;
|
||||||
DocuMineBlockificationService docuMineBlockificationService;
|
DocuMineBlockificationService docuMineBlockificationService;
|
||||||
RedactManagerBlockificationService redactManagerBlockificationService;
|
RedactManagerBlockificationService redactManagerBlockificationService;
|
||||||
|
BlockificationPostprocessingService blockificationPostprocessingService;
|
||||||
DocstrumBlockificationService docstrumBlockificationService;
|
DocstrumBlockificationService docstrumBlockificationService;
|
||||||
LayoutGridService layoutGridService;
|
LayoutGridService layoutGridService;
|
||||||
ObservationRegistry observationRegistry;
|
ObservationRegistry observationRegistry;
|
||||||
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
VisualLayoutParsingAdapter visualLayoutParsingAdapter;
|
||||||
ClarifyndClassificationService clarifyndClassificationService;
|
ClarifyndClassificationService clarifyndClassificationService;
|
||||||
GraphicExtractorService graphicExtractorService;
|
GraphicExtractorService graphicExtractorService;
|
||||||
|
OutlineExtractorService outlineExtractorService;
|
||||||
|
OutlineValidationService outlineValidationService;
|
||||||
|
TOCEnrichmentService tocEnrichmentService;
|
||||||
LayoutparserSettings settings;
|
LayoutparserSettings settings;
|
||||||
|
|
||||||
|
|
||||||
@ -105,21 +116,28 @@ public class LayoutParsingPipeline {
|
|||||||
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
log.info("Starting layout parsing for {}", layoutParsingRequest.identifier());
|
||||||
|
|
||||||
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
File originFile = layoutParsingStorageService.getOriginFile(layoutParsingRequest.originFileStorageId());
|
||||||
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId()).orElse(originFile);
|
File viewerDocumentFile = layoutParsingStorageService.getViewerDocFile(layoutParsingRequest.viewerDocumentStorageId())
|
||||||
|
.orElse(originFile);
|
||||||
|
|
||||||
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
|
VisualLayoutParsingResponse visualLayoutParsingResponse = new VisualLayoutParsingResponse();
|
||||||
if (layoutParsingRequest.visualLayoutParsingFileId().isPresent()) {
|
if (layoutParsingRequest.visualLayoutParsingFileId()
|
||||||
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId().get());
|
.isPresent()) {
|
||||||
|
visualLayoutParsingResponse = layoutParsingStorageService.getVisualLayoutParsingFile(layoutParsingRequest.visualLayoutParsingFileId()
|
||||||
|
.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
ImageServiceResponse imageServiceResponse = new ImageServiceResponse();
|
||||||
if (layoutParsingRequest.imagesFileStorageId().isPresent()) {
|
if (layoutParsingRequest.imagesFileStorageId()
|
||||||
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId().get());
|
.isPresent()) {
|
||||||
|
imageServiceResponse = layoutParsingStorageService.getImagesFile(layoutParsingRequest.imagesFileStorageId()
|
||||||
|
.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
TableServiceResponse tableServiceResponse = new TableServiceResponse();
|
||||||
if (layoutParsingRequest.tablesFileStorageId().isPresent()) {
|
if (layoutParsingRequest.tablesFileStorageId()
|
||||||
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId().get());
|
.isPresent()) {
|
||||||
|
tableServiceResponse = layoutParsingStorageService.getTablesFile(layoutParsingRequest.tablesFileStorageId()
|
||||||
|
.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
|
ClassificationDocument classificationDocument = parseLayout(settings.getLayoutParsingTypeOverride() == null //
|
||||||
@ -199,15 +217,15 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
private String buildSemanticNodeCountMessage(int numberOfPages, Map<NodeType, Long> semanticNodeCounts) {
|
||||||
|
|
||||||
return String.format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
return format("%d pages with %d sections, %d headlines, %d paragraphs, %d tables with %d cells, %d headers, and %d footers parsed",
|
||||||
numberOfPages,
|
numberOfPages,
|
||||||
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
semanticNodeCounts.get(NodeType.SECTION) == null ? 0 : semanticNodeCounts.get(NodeType.SECTION),
|
||||||
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
semanticNodeCounts.get(NodeType.HEADLINE) == null ? 0 : semanticNodeCounts.get(NodeType.HEADLINE),
|
||||||
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
semanticNodeCounts.get(NodeType.PARAGRAPH) == null ? 0 : semanticNodeCounts.get(NodeType.PARAGRAPH),
|
||||||
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
semanticNodeCounts.get(NodeType.TABLE) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE),
|
||||||
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
semanticNodeCounts.get(NodeType.TABLE_CELL) == null ? 0 : semanticNodeCounts.get(NodeType.TABLE_CELL),
|
||||||
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
semanticNodeCounts.get(NodeType.HEADER) == null ? 0 : semanticNodeCounts.get(NodeType.HEADER),
|
||||||
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
semanticNodeCounts.get(NodeType.FOOTER) == null ? 0 : semanticNodeCounts.get(NodeType.FOOTER));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -222,6 +240,7 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
PDDocument originDocument = openDocument(originFile);
|
PDDocument originDocument = openDocument(originFile);
|
||||||
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
|
addNumberOfPagesToTrace(originDocument.getNumberOfPages(), Files.size(originFile.toPath()));
|
||||||
|
|
||||||
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
Map<Integer, List<TableCells>> pdfTableCells = cvTableParsingAdapter.buildCvParsedTablesPerPage(tableServiceResponse);
|
||||||
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
Map<Integer, List<ClassifiedImage>> pdfImages = imageServiceResponseAdapter.buildClassifiedImagesPerPage(imageServiceResponse);
|
||||||
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
Map<Integer, List<ClassifiedImage>> signatures = visualLayoutParsingAdapter.buildExtractedSignaturesPerPage(visualLayoutParsingResponse);
|
||||||
@ -232,6 +251,12 @@ public class LayoutParsingPipeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
List<ClassificationPage> classificationPages = new ArrayList<>();
|
List<ClassificationPage> classificationPages = new ArrayList<>();
|
||||||
|
OutlineObject lastProcessedOutlineObject = null;
|
||||||
|
|
||||||
|
// parsing the structure elements could be useful as well
|
||||||
|
if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
|
||||||
|
classificationDocument.setOutlineObjectTree(outlineExtractorService.getOutlineObjectTree(originDocument));
|
||||||
|
}
|
||||||
|
|
||||||
long pageCount = originDocument.getNumberOfPages();
|
long pageCount = originDocument.getNumberOfPages();
|
||||||
|
|
||||||
@ -277,7 +302,13 @@ public class LayoutParsingPipeline {
|
|||||||
|
|
||||||
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
|
TextRulingsClassifier.classifyUnderlinedAndStrikethroughText(words, cleanRulings);
|
||||||
|
|
||||||
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument, pdPage, pageNumber, cleanRulings, stripper.getTextPositionSequences(), false);
|
List<Box> graphics = graphicExtractorService.extractPathElementGraphics(originDocument,
|
||||||
|
pdPage,
|
||||||
|
pageNumber,
|
||||||
|
cleanRulings,
|
||||||
|
stripper.getTextPositionSequences(),
|
||||||
|
|
||||||
|
false);
|
||||||
|
|
||||||
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
pdfImages.computeIfAbsent(pageNumber, x -> new ArrayList<>())
|
||||||
.addAll(graphics.stream()
|
.addAll(graphics.stream()
|
||||||
@ -301,6 +332,20 @@ public class LayoutParsingPipeline {
|
|||||||
classificationPage.setPageWidth(cropbox.getWidth());
|
classificationPage.setPageWidth(cropbox.getWidth());
|
||||||
classificationPage.setPageHeight(cropbox.getHeight());
|
classificationPage.setPageHeight(cropbox.getHeight());
|
||||||
|
|
||||||
|
if(layoutParsingType != LayoutParsingType.REDACT_MANAGER_OLD && layoutParsingType != LayoutParsingType.DOCUMINE_OLD) {
|
||||||
|
List<OutlineObject> outlineObjects = classificationDocument.getOutlineObjectTree().getOutlineObjectsPerPage().getOrDefault(pageNumber - 1, new ArrayList<>());
|
||||||
|
|
||||||
|
OutlineObject notFoundOutlineObject = null;
|
||||||
|
if (lastProcessedOutlineObject != null && !lastProcessedOutlineObject.isFound()) {
|
||||||
|
lastProcessedOutlineObject.setPoint(new Point2D.Float(0, cropbox.getHeight()));
|
||||||
|
notFoundOutlineObject = lastProcessedOutlineObject;
|
||||||
|
}
|
||||||
|
if (!outlineObjects.isEmpty()) {
|
||||||
|
classificationPage.setOutlineObjects(outlineObjects);
|
||||||
|
lastProcessedOutlineObject = blockificationPostprocessingService.sanitizeOutlineBlocks(classificationPage, notFoundOutlineObject);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
|
classificationDocument.getVisualizations().addMarkedContentVisualizations(stripper.getMarkedContents(), pageNumber);
|
||||||
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
// MarkedContent needs to be converted at this point, otherwise it leads to GC Problems in Pdfbox.
|
||||||
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
|
classificationPage.setMarkedContentBboxPerType(convertMarkedContents(stripper.getMarkedContents()));
|
||||||
@ -342,13 +387,22 @@ public class LayoutParsingPipeline {
|
|||||||
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
case CLARIFYND -> clarifyndClassificationService.classifyDocument(classificationDocument);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
List<TextPageBlock> headlines = classificationDocument.getPages()
|
||||||
|
.stream()
|
||||||
|
.flatMap(classificationPage -> classificationPage.getTextBlocks()
|
||||||
|
.stream()
|
||||||
|
.filter(tb -> tb instanceof TextPageBlock && tb.getClassification() != null && tb.getClassification().isHeadline())
|
||||||
|
.map(tb -> (TextPageBlock) tb))
|
||||||
|
.toList();
|
||||||
|
TableOfContents tableOfContents = outlineValidationService.createToC(headlines);
|
||||||
|
classificationDocument.setTableOfContents(tableOfContents);
|
||||||
|
|
||||||
log.info("Building Sections for {}", identifier);
|
log.info("Building Sections for {}", identifier);
|
||||||
|
|
||||||
switch (layoutParsingType) {
|
switch (layoutParsingType) {
|
||||||
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
|
case CLARIFYND_PARAGRAPH_DEBUG, REDACT_MANAGER_PARAGRAPH_DEBUG -> sectionsBuilderService.buildParagraphDebugSections(classificationDocument);
|
||||||
default -> {
|
default -> {
|
||||||
sectionsBuilderService.buildSections(classificationDocument);
|
tocEnrichmentService.assignSectionBlocksAndImages(classificationDocument);
|
||||||
sectionsBuilderService.addImagesToSections(classificationDocument);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -3,6 +3,8 @@ package com.knecon.fforesight.service.layoutparser.processor.model;
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObjectTree;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContents;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.UnclassifiedText;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
import com.knecon.fforesight.service.layoutparser.processor.visualization.LayoutparsingVisualizations;
|
||||||
@ -28,4 +30,7 @@ public class ClassificationDocument {
|
|||||||
|
|
||||||
private long rulesVersion;
|
private long rulesVersion;
|
||||||
|
|
||||||
|
private OutlineObjectTree outlineObjectTree;
|
||||||
|
private TableOfContents tableOfContents;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -8,13 +8,13 @@ import java.util.Map;
|
|||||||
|
|
||||||
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
import com.iqser.red.service.persistence.service.v1.api.shared.model.redactionlog.Rectangle;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.CleanRulings;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.StringFrequencyCounter;
|
||||||
|
|
||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.NonNull;
|
import lombok.NonNull;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
|
|
||||||
|
|
||||||
@Data
|
@Data
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
@ -23,6 +23,10 @@ public class ClassificationPage {
|
|||||||
@NonNull
|
@NonNull
|
||||||
private List<AbstractPageBlock> textBlocks;
|
private List<AbstractPageBlock> textBlocks;
|
||||||
|
|
||||||
|
private List<OutlineObject> outlineObjects = new ArrayList<>();
|
||||||
|
|
||||||
|
private List<AbstractPageBlock> headlines = new ArrayList<>();
|
||||||
|
|
||||||
private List<ClassifiedImage> images = new ArrayList<>();
|
private List<ClassifiedImage> images = new ArrayList<>();
|
||||||
|
|
||||||
private Rectangle bodyTextFrame;
|
private Rectangle bodyTextFrame;
|
||||||
|
|||||||
@ -12,6 +12,7 @@ import lombok.NoArgsConstructor;
|
|||||||
|
|
||||||
@Data
|
@Data
|
||||||
@NoArgsConstructor
|
@NoArgsConstructor
|
||||||
|
@Deprecated
|
||||||
public class ClassificationSection {
|
public class ClassificationSection {
|
||||||
|
|
||||||
private List<AbstractPageBlock> pageBlocks = new ArrayList<>();
|
private List<AbstractPageBlock> pageBlocks = new ArrayList<>();
|
||||||
|
|||||||
@ -31,6 +31,19 @@ public enum PageBlockType {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static int getHeadlineNumber(PageBlockType pageBlockType) {
|
||||||
|
|
||||||
|
return switch (pageBlockType) {
|
||||||
|
case H1 -> 1;
|
||||||
|
case H2 -> 2;
|
||||||
|
case H3 -> 3;
|
||||||
|
case H4 -> 4;
|
||||||
|
case H5 -> 5;
|
||||||
|
default -> 6;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public boolean isHeadline() {
|
public boolean isHeadline() {
|
||||||
|
|
||||||
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
|
return this.equals(H1) || this.equals(H2) || this.equals(H3) || this.equals(H4) || this.equals(H5) || this.equals(H6);
|
||||||
|
|||||||
@ -8,6 +8,7 @@ import java.util.regex.Pattern;
|
|||||||
|
|
||||||
import lombok.AccessLevel;
|
import lombok.AccessLevel;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Getter;
|
||||||
import lombok.experimental.FieldDefaults;
|
import lombok.experimental.FieldDefaults;
|
||||||
|
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
@ -16,13 +17,15 @@ public class SectionIdentifier {
|
|||||||
|
|
||||||
static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
|
static Pattern numericalIdentifierPattern = Pattern.compile("^[\\s]?(\\d+)[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?[\\s.,;]?(\\d+)?");
|
||||||
|
|
||||||
private enum Format {
|
public enum Format {
|
||||||
EMPTY,
|
EMPTY,
|
||||||
NUMERICAL,
|
NUMERICAL,
|
||||||
DOCUMENT
|
DOCUMENT
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Getter
|
||||||
Format format;
|
Format format;
|
||||||
|
@Getter
|
||||||
String identifierString;
|
String identifierString;
|
||||||
List<Integer> identifiers;
|
List<Integer> identifiers;
|
||||||
boolean asChild;
|
boolean asChild;
|
||||||
|
|||||||
@ -140,8 +140,8 @@ public class DocumentTree {
|
|||||||
if (treeId.isEmpty()) {
|
if (treeId.isEmpty()) {
|
||||||
return root;
|
return root;
|
||||||
}
|
}
|
||||||
Entry entry = root.children.get(treeId.get(0));
|
Entry entry = root;
|
||||||
for (int id : treeId.subList(1, treeId.size())) {
|
for (int id : treeId) {
|
||||||
entry = entry.children.get(id);
|
entry = entry.children.get(id);
|
||||||
}
|
}
|
||||||
return entry;
|
return entry;
|
||||||
|
|||||||
@ -18,78 +18,20 @@ import lombok.Builder;
|
|||||||
import lombok.Data;
|
import lombok.Data;
|
||||||
import lombok.EqualsAndHashCode;
|
import lombok.EqualsAndHashCode;
|
||||||
import lombok.experimental.FieldDefaults;
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import lombok.experimental.SuperBuilder;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
@Slf4j
|
|
||||||
@Data
|
@Data
|
||||||
@Builder
|
@SuperBuilder
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
@FieldDefaults(level = AccessLevel.PRIVATE)
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
public class Section implements GenericSemanticNode {
|
@EqualsAndHashCode(callSuper = true)
|
||||||
|
public class Section extends SectionNode {
|
||||||
@Builder.Default
|
|
||||||
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
|
||||||
List<Integer> treeId;
|
|
||||||
|
|
||||||
TextBlock textBlock;
|
|
||||||
@EqualsAndHashCode.Exclude
|
|
||||||
DocumentTree documentTree;
|
|
||||||
|
|
||||||
@Builder.Default
|
|
||||||
@EqualsAndHashCode.Exclude
|
|
||||||
Set<RedactionEntity> entities = new HashSet<>();
|
|
||||||
|
|
||||||
@EqualsAndHashCode.Exclude
|
|
||||||
Map<Page, Rectangle2D> bBoxCache;
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public NodeType getType() {
|
|
||||||
|
|
||||||
return NodeType.SECTION;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean hasTables() {
|
|
||||||
|
|
||||||
return streamAllSubNodesOfType(NodeType.TABLE).findAny()
|
|
||||||
.isPresent();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public TextBlock getTextBlock() {
|
|
||||||
|
|
||||||
if (textBlock == null) {
|
|
||||||
textBlock = GenericSemanticNode.super.getTextBlock();
|
|
||||||
}
|
|
||||||
return textBlock;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
||||||
return treeId.toString() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary();
|
return super.toString();
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public Headline getHeadline() {
|
|
||||||
|
|
||||||
return streamChildrenOfType(NodeType.HEADLINE)//
|
|
||||||
.map(node -> (Headline) node)//
|
|
||||||
.findFirst()//
|
|
||||||
.orElseGet(() -> getParent().getHeadline());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Map<Page, Rectangle2D> getBBox() {
|
|
||||||
|
|
||||||
if (bBoxCache == null) {
|
|
||||||
bBoxCache = GenericSemanticNode.super.getBBox();
|
|
||||||
}
|
|
||||||
return bBoxCache;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,103 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.TextBlock;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.NoArgsConstructor;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import lombok.experimental.SuperBuilder;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
@Data
|
||||||
|
@SuperBuilder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@NoArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
public abstract class SectionNode implements GenericSemanticNode {
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
Set<LayoutEngine> engines = new HashSet<>(Set.of(LayoutEngine.ALGORITHM));
|
||||||
|
List<Integer> treeId;
|
||||||
|
|
||||||
|
TextBlock textBlock;
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
DocumentTree documentTree;
|
||||||
|
|
||||||
|
@Builder.Default
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Set<RedactionEntity> entities = new HashSet<>();
|
||||||
|
|
||||||
|
@EqualsAndHashCode.Exclude
|
||||||
|
Map<Page, Rectangle2D> bBoxCache;
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NodeType getType() {
|
||||||
|
|
||||||
|
return NodeType.SECTION;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean hasTables() {
|
||||||
|
|
||||||
|
return streamAllSubNodesOfType(NodeType.TABLE).findAny()
|
||||||
|
.isPresent();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isLeafSection() {
|
||||||
|
|
||||||
|
return streamAllSubNodesOfType(NodeType.SECTION).findAny()
|
||||||
|
.isEmpty();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TextBlock getTextBlock() {
|
||||||
|
|
||||||
|
if (textBlock == null) {
|
||||||
|
textBlock = GenericSemanticNode.super.getTextBlock();
|
||||||
|
}
|
||||||
|
return textBlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return treeId.toString() + ": " + NodeType.SECTION + ": " + this.getTextBlock().buildSummary();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Headline getHeadline() {
|
||||||
|
|
||||||
|
return streamChildrenOfType(NodeType.HEADLINE)//
|
||||||
|
.map(node -> (Headline) node)//
|
||||||
|
.findFirst()//
|
||||||
|
.orElseGet(() -> getParent().getHeadline());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<Page, Rectangle2D> getBBox() {
|
||||||
|
|
||||||
|
if (bBoxCache == null) {
|
||||||
|
bBoxCache = GenericSemanticNode.super.getBBox();
|
||||||
|
}
|
||||||
|
return bBoxCache;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,40 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.LayoutEngine;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.internal.api.data.redaction.NodeType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.DocumentTree;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.entity.RedactionEntity;
|
||||||
|
|
||||||
|
import lombok.AccessLevel;
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
import lombok.ToString;
|
||||||
|
import lombok.experimental.FieldDefaults;
|
||||||
|
import lombok.experimental.SuperBuilder;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@SuperBuilder
|
||||||
|
@AllArgsConstructor
|
||||||
|
@FieldDefaults(level = AccessLevel.PRIVATE)
|
||||||
|
@EqualsAndHashCode(callSuper = true)
|
||||||
|
public class SuperSection extends SectionNode {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public NodeType getType() {
|
||||||
|
|
||||||
|
return NodeType.SUPER_SECTION;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return super.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -50,14 +50,16 @@ public class ConcatenatedTextBlock implements TextBlock {
|
|||||||
|
|
||||||
public ConcatenatedTextBlock concat(TextBlock textBlock) {
|
public ConcatenatedTextBlock concat(TextBlock textBlock) {
|
||||||
|
|
||||||
|
int start = textBlock.getBoundary().start();
|
||||||
|
int end = textBlock.getBoundary().end();
|
||||||
if (this.atomicTextBlocks.isEmpty()) {
|
if (this.atomicTextBlocks.isEmpty()) {
|
||||||
boundary.setStart(textBlock.getBoundary().start());
|
boundary.setStart(start);
|
||||||
boundary.setEnd(textBlock.getBoundary().end());
|
boundary.setEnd(end);
|
||||||
} else if (boundary.end() != textBlock.getBoundary().start()) {
|
} else if (boundary.end() != start) {
|
||||||
throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
|
throw new UnsupportedOperationException(format("Can only concat consecutive TextBlocks, trying to concat %s and %s", boundary, textBlock.getBoundary()));
|
||||||
}
|
}
|
||||||
this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
|
this.atomicTextBlocks.addAll(textBlock.getAtomicTextBlocks());
|
||||||
boundary.setEnd(textBlock.getBoundary().end());
|
boundary.setEnd(end);
|
||||||
this.searchText = null;
|
this.searchText = null;
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -0,0 +1,209 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||||
|
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import org.apache.pdfbox.cos.COSArray;
|
||||||
|
import org.apache.pdfbox.cos.COSBase;
|
||||||
|
import org.apache.pdfbox.cos.COSDictionary;
|
||||||
|
import org.apache.pdfbox.cos.COSName;
|
||||||
|
import org.apache.pdfbox.cos.COSString;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDestinationNameTreeNode;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocumentNameDictionary;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
|
import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
|
||||||
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
|
||||||
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitHeightDestination;
|
||||||
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitRectangleDestination;
|
||||||
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageFitWidthDestination;
|
||||||
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageXYZDestination;
|
||||||
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
|
||||||
|
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@Slf4j
|
||||||
|
public class OutlineExtractorService {
|
||||||
|
|
||||||
|
private static final String PDDESTINATION_TYPE_FIT = "Fit";
|
||||||
|
private static final String PDDESTINATION_TYPE_FIT_B = "FitB";
|
||||||
|
private static final String PDDESTINATION_TYPE_FIT_H = "FitH";
|
||||||
|
private static final String PDDESTINATION_TYPE_FIT_V = "FitV";
|
||||||
|
private static final String PDDESTINATION_TYPE_FIT_R = "FitR";
|
||||||
|
private static final String PDDESTINATION_TYPE_FIT_BH = "FitBH";
|
||||||
|
private static final String PDDESTINATION_TYPE_FIT_BV = "FitBV";
|
||||||
|
private static final String PDDESTINATION_TYPE_XYZ = "XYZ";
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public OutlineObjectTree getOutlineObjectTree(PDDocument document) {
|
||||||
|
|
||||||
|
PDDocumentOutline documentOutline = document.getDocumentCatalog().getDocumentOutline();
|
||||||
|
|
||||||
|
List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
|
||||||
|
if (documentOutline != null) {
|
||||||
|
for (PDOutlineItem child : documentOutline.children()) {
|
||||||
|
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, 1);
|
||||||
|
outlineObjectWithChildren.ifPresent(rootNodes::add);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return new OutlineObjectTree(rootNodes);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private Optional<OutlineObjectTreeNode> createOutlineObjectWithChildren(PDOutlineItem item, PDDocument document, int depth) {
|
||||||
|
|
||||||
|
Optional<OutlineObjectTreeNode> outlineObject = createOutlineObject(item, document, depth);
|
||||||
|
if (outlineObject.isPresent()) {
|
||||||
|
for (var child : item.children()) {
|
||||||
|
Optional<OutlineObjectTreeNode> outlineObjectWithChildren = createOutlineObjectWithChildren(child, document, depth + 1);
|
||||||
|
outlineObjectWithChildren.ifPresent(outlineObjectTreeNode -> outlineObject.get().addChild(outlineObjectTreeNode));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return outlineObject;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// if the structure elements are processed beforehand, another case can be handled here as well:
|
||||||
|
// outline objects can reference structure elements (see pdf documentation)
|
||||||
|
@SneakyThrows
|
||||||
|
private Optional<OutlineObjectTreeNode> createOutlineObject(PDOutlineItem item, PDDocument document, int depth) {
|
||||||
|
|
||||||
|
String title = item.getTitle();
|
||||||
|
|
||||||
|
PDPage page = item.findDestinationPage(document);
|
||||||
|
if (page == null) {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
int pageNumber = document.getPages().indexOf(page);
|
||||||
|
|
||||||
|
Optional<Point2D> outlinePosition = Optional.empty();
|
||||||
|
|
||||||
|
try {
|
||||||
|
PDDocumentNameDictionary names = document.getDocumentCatalog().getNames();
|
||||||
|
PDDestinationNameTreeNode destinations = null;
|
||||||
|
if (names != null) {
|
||||||
|
destinations = names.getDests();
|
||||||
|
}
|
||||||
|
|
||||||
|
PDDestination destination = item.getDestination();
|
||||||
|
if (destination != null) {
|
||||||
|
outlinePosition = getLocationFromCOSBase(destinations, destination.getCOSObject());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (outlinePosition.isEmpty()) {
|
||||||
|
|
||||||
|
PDAction action = item.getAction();
|
||||||
|
if (action != null) {
|
||||||
|
outlinePosition = extractOutlineLocationGoTo(destinations, action.getCOSObject());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.info(String.format("Error occurred during position resolution for outline item on page %s with title %s: " + e, pageNumber, title));
|
||||||
|
}
|
||||||
|
|
||||||
|
return Optional.of(new OutlineObjectTreeNode(new OutlineObject(title, pageNumber, outlinePosition.orElse(new Point2D.Float(0, 0)), depth)));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private static Optional<Point2D> extractOutlineLocationGoTo(PDDestinationNameTreeNode destinations, COSDictionary cosDictionary) {
|
||||||
|
|
||||||
|
if (isGoToAction(cosDictionary)) {
|
||||||
|
COSBase cosBase = cosDictionary.getItem(COSName.D);
|
||||||
|
return getLocationFromCOSBase(destinations, cosBase);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static Optional<Point2D> getLocationFromCOSBase(PDDestinationNameTreeNode destinations, COSBase cosBase) throws IOException {
|
||||||
|
|
||||||
|
if (cosBase != null) {
|
||||||
|
if (cosBase instanceof COSArray cosArray) {
|
||||||
|
return getLocationFromCosArray(cosArray);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cosBase instanceof COSString cosString) {
|
||||||
|
String destinationName = cosString.getString();
|
||||||
|
COSArray cosArray = destinations.getValue(destinationName).getCOSObject();
|
||||||
|
return getLocationFromCosArray(cosArray);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static Optional<Point2D> getLocationFromCosArray(COSArray cosArray) {
|
||||||
|
|
||||||
|
boolean located = false;
|
||||||
|
float x = 0;
|
||||||
|
float y = 0;
|
||||||
|
|
||||||
|
try {
|
||||||
|
|
||||||
|
PDDestination destination = PDDestination.create(cosArray);
|
||||||
|
COSName type = (COSName) cosArray.getObject(1);
|
||||||
|
String typeString = type.getName();
|
||||||
|
|
||||||
|
switch (typeString) {
|
||||||
|
case PDDESTINATION_TYPE_FIT_V:
|
||||||
|
case PDDESTINATION_TYPE_FIT_BV:
|
||||||
|
PDPageFitHeightDestination fitHeightDestination = (PDPageFitHeightDestination) destination;
|
||||||
|
x = fitHeightDestination.getLeft();
|
||||||
|
located = true;
|
||||||
|
break;
|
||||||
|
case PDDESTINATION_TYPE_FIT_R:
|
||||||
|
PDPageFitRectangleDestination fitRectangleDestination = (PDPageFitRectangleDestination) destination;
|
||||||
|
x = fitRectangleDestination.getLeft();
|
||||||
|
y = fitRectangleDestination.getTop();
|
||||||
|
located = true;
|
||||||
|
break;
|
||||||
|
case PDDESTINATION_TYPE_FIT_H:
|
||||||
|
case PDDESTINATION_TYPE_FIT_BH:
|
||||||
|
PDPageFitWidthDestination fitWidthDestination = (PDPageFitWidthDestination) destination;
|
||||||
|
y = fitWidthDestination.getTop();
|
||||||
|
located = true;
|
||||||
|
break;
|
||||||
|
case PDDESTINATION_TYPE_XYZ:
|
||||||
|
PDPageXYZDestination xyzDestination = (PDPageXYZDestination) destination;
|
||||||
|
x = xyzDestination.getLeft();
|
||||||
|
y = xyzDestination.getTop();
|
||||||
|
located = true;
|
||||||
|
break;
|
||||||
|
case PDDESTINATION_TYPE_FIT:
|
||||||
|
case PDDESTINATION_TYPE_FIT_B:
|
||||||
|
default:
|
||||||
|
}
|
||||||
|
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
|
||||||
|
return located ? Optional.of(new Point2D.Float(x, y)) : Optional.empty();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static boolean isGoToAction(COSDictionary cosDictionary) {
|
||||||
|
|
||||||
|
return cosDictionary.getNameAsString("S").toLowerCase(Locale.ROOT).equals("goto");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,35 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||||
|
|
||||||
|
import java.awt.geom.Point2D;
|
||||||
|
|
||||||
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
@AllArgsConstructor
|
||||||
|
public class OutlineObject {
|
||||||
|
|
||||||
|
private final String title;
|
||||||
|
private final int pageNumber;
|
||||||
|
private Point2D point;
|
||||||
|
private final int treeDepth;
|
||||||
|
|
||||||
|
private boolean found;
|
||||||
|
|
||||||
|
|
||||||
|
public OutlineObject(String title, int pageNumber, Point2D point2D, int depth) {
|
||||||
|
|
||||||
|
this(title, pageNumber, depth);
|
||||||
|
this.point = point2D;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return "OutlineObject{" + "title='" + title + '\'' + '}';
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,42 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class OutlineObjectTree {
|
||||||
|
|
||||||
|
private List<OutlineObjectTreeNode> rootNodes = new ArrayList<>();
|
||||||
|
|
||||||
|
private Map<Integer, List<OutlineObject>> outlineObjectsPerPage = new HashMap<>();
|
||||||
|
|
||||||
|
|
||||||
|
public OutlineObjectTree(List<OutlineObjectTreeNode> rootNodes) {
|
||||||
|
|
||||||
|
this.rootNodes = rootNodes;
|
||||||
|
flattenNodesAndGroupByPage(rootNodes);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void flattenNodesAndGroupByPage(List<OutlineObjectTreeNode> outlineObjectTreeNodes) {
|
||||||
|
|
||||||
|
for (OutlineObjectTreeNode node : outlineObjectTreeNodes) {
|
||||||
|
int pageNumber = node.getOutlineObject().getPageNumber();
|
||||||
|
if (!this.outlineObjectsPerPage.containsKey(pageNumber)) {
|
||||||
|
outlineObjectsPerPage.put(pageNumber, new ArrayList<>());
|
||||||
|
}
|
||||||
|
outlineObjectsPerPage.get(pageNumber).add(node.getOutlineObject());
|
||||||
|
|
||||||
|
if (!node.getChildren().isEmpty()) {
|
||||||
|
flattenNodesAndGroupByPage(node.getChildren());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,34 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
public class OutlineObjectTreeNode {
|
||||||
|
|
||||||
|
private OutlineObject outlineObject;
|
||||||
|
|
||||||
|
private List<OutlineObjectTreeNode> children = new ArrayList<>();
|
||||||
|
|
||||||
|
|
||||||
|
public OutlineObjectTreeNode(OutlineObject outlineObject) {
|
||||||
|
|
||||||
|
this.outlineObject = outlineObject;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addChild(OutlineObjectTreeNode outlineObject) {
|
||||||
|
|
||||||
|
children.add(outlineObject);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return "OutlineObjectTreeNode{" + "outlineObject=" + outlineObject + '}';
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,59 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.TreeSet;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@Slf4j
|
||||||
|
public class OutlineValidationService {
|
||||||
|
|
||||||
|
public TableOfContents createToC(List<TextPageBlock> headlines) {
|
||||||
|
|
||||||
|
List<TableOfContentItem> mainSections = new ArrayList<>();
|
||||||
|
Map<Integer, TableOfContentItem> lastItemsPerDepth = new HashMap<>();
|
||||||
|
TableOfContentItem last = null;
|
||||||
|
TreeSet<Integer> depths = new TreeSet<>();
|
||||||
|
|
||||||
|
for (TextPageBlock current : headlines) {
|
||||||
|
int currentDepth = getHeadlineNumber(current.getClassification());
|
||||||
|
Integer parentDepth = depths.floor(currentDepth - 1);
|
||||||
|
|
||||||
|
var tocItem = new TableOfContentItem(current);
|
||||||
|
|
||||||
|
if (parentDepth == null) {
|
||||||
|
mainSections.add(tocItem);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
assert last != null;
|
||||||
|
int lastDepth = getHeadlineNumber(last.getHeadline().getClassification());
|
||||||
|
|
||||||
|
if (lastDepth < parentDepth) {
|
||||||
|
parentDepth = lastDepth;
|
||||||
|
} else if (lastDepth == currentDepth && last.getParent() != null) {
|
||||||
|
parentDepth = getHeadlineNumber(last.getParent().getHeadline().getClassification());
|
||||||
|
}
|
||||||
|
|
||||||
|
TableOfContentItem parent = lastItemsPerDepth.get(parentDepth);
|
||||||
|
parent.addChild(tocItem);
|
||||||
|
}
|
||||||
|
|
||||||
|
last = tocItem;
|
||||||
|
lastItemsPerDepth.put(currentDepth, tocItem);
|
||||||
|
depths.add(currentDepth);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new TableOfContents(mainSections);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,261 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationDocument;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationFooter;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationHeader;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.Cell;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
|
@Slf4j
|
||||||
|
@Service
|
||||||
|
public class TOCEnrichmentService {
|
||||||
|
|
||||||
|
public void assignSectionBlocksAndImages(ClassificationDocument document) {
|
||||||
|
|
||||||
|
TableOfContents toc = document.getTableOfContents();
|
||||||
|
Iterator<TableOfContentItem> iterator = toc.iterator();
|
||||||
|
TableOfContentItem currentTOCItem = null;
|
||||||
|
if(iterator.hasNext()) {
|
||||||
|
currentTOCItem = iterator.next();
|
||||||
|
}
|
||||||
|
List<AbstractPageBlock> startBlocks = new ArrayList<>();
|
||||||
|
List<ClassifiedImage> startImages = new ArrayList<>();
|
||||||
|
TableOfContentItem currentSection = null;
|
||||||
|
boolean foundFirstHeadline = false;
|
||||||
|
|
||||||
|
List<ClassificationHeader> headers = new ArrayList<>();
|
||||||
|
List<ClassificationFooter> footers = new ArrayList<>();
|
||||||
|
TablePageBlock previousTable = null;
|
||||||
|
List<TableOfContentItem> lastFoundTOCItems = new ArrayList<>();
|
||||||
|
|
||||||
|
for (ClassificationPage page : document.getPages()) {
|
||||||
|
List<TableOfContentItem> currentPageTOCItems = new ArrayList<>();
|
||||||
|
List<TextPageBlock> header = new ArrayList<>();
|
||||||
|
List<TextPageBlock> footer = new ArrayList<>();
|
||||||
|
for (AbstractPageBlock current : page.getTextBlocks()) {
|
||||||
|
|
||||||
|
if (current.getClassification() == null) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
current.setPage(page.getPageNumber());
|
||||||
|
|
||||||
|
if (current.getClassification().equals(PageBlockType.HEADER)) {
|
||||||
|
header.add((TextPageBlock) current);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (current.getClassification().equals(PageBlockType.FOOTER)) {
|
||||||
|
footer.add((TextPageBlock) current);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (current instanceof TablePageBlock table) {
|
||||||
|
if (previousTable != null) {
|
||||||
|
mergeTableMetadata(table, previousTable);
|
||||||
|
}
|
||||||
|
previousTable = table;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (current instanceof TextPageBlock && currentTOCItem != null && currentTOCItem.getHeadline().getText().equals(current.getText())) {
|
||||||
|
if (!foundFirstHeadline) {
|
||||||
|
foundFirstHeadline = true;
|
||||||
|
}
|
||||||
|
currentSection = currentTOCItem;
|
||||||
|
currentTOCItem.getSectionBlocks().add(current);
|
||||||
|
currentPageTOCItems.add(currentTOCItem);
|
||||||
|
|
||||||
|
if(iterator.hasNext()) {
|
||||||
|
currentTOCItem = iterator.next();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!foundFirstHeadline) {
|
||||||
|
startBlocks.add(current);
|
||||||
|
} else {
|
||||||
|
currentSection.getSectionBlocks().add(current);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!currentPageTOCItems.isEmpty()) {
|
||||||
|
lastFoundTOCItems = currentPageTOCItems;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (ClassifiedImage image : page.getImages()) {
|
||||||
|
|
||||||
|
Double xMin = null;
|
||||||
|
Double yMin = null;
|
||||||
|
Double xMax = null;
|
||||||
|
Double yMax = null;
|
||||||
|
|
||||||
|
for (TableOfContentItem tocItem : lastFoundTOCItems) {
|
||||||
|
var headline = tocItem.getHeadline();
|
||||||
|
|
||||||
|
if (headline.getPage() != page.getPageNumber()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (headline.getMinX() < headline.getMaxX()) {
|
||||||
|
if (xMin == null || headline.getMinX() < xMin) {
|
||||||
|
xMin = headline.getMinX();
|
||||||
|
}
|
||||||
|
if (xMax == null || headline.getMaxX() > xMax) {
|
||||||
|
xMax = headline.getMaxX();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (xMin == null || headline.getMaxX() < xMin) {
|
||||||
|
xMin = headline.getMaxX();
|
||||||
|
}
|
||||||
|
if (xMax == null || headline.getMinX() > xMax) {
|
||||||
|
xMax = headline.getMinX();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (headline.getMinY() < headline.getMaxY()) {
|
||||||
|
if (yMin == null || headline.getMinY() < yMin) {
|
||||||
|
yMin = headline.getMinY();
|
||||||
|
}
|
||||||
|
if (yMax == null || headline.getMaxY() > yMax) {
|
||||||
|
yMax = headline.getMaxY();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (yMin == null || headline.getMaxY() < yMin) {
|
||||||
|
yMin = headline.getMaxY();
|
||||||
|
}
|
||||||
|
if (yMax == null || headline.getMinY() > yMax) {
|
||||||
|
yMax = headline.getMinY();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
log.debug("Image position x: {}, y: {}", image.getPosition().getX(), image.getPosition().getY());
|
||||||
|
log.debug("Headline position xMin: {}, xMax: {}, yMin: {}, yMax: {}", xMin, xMax, yMin, yMax);
|
||||||
|
|
||||||
|
if (image.getPosition().getX() >= xMin && image.getPosition().getX() <= xMax && image.getPosition().getY() >= yMin && image.getPosition().getY() <= yMax) {
|
||||||
|
tocItem.getImages().add(image);
|
||||||
|
image.setAppendedToSection(true);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!image.isAppendedToSection()) {
|
||||||
|
log.debug("Image uses first paragraph");
|
||||||
|
if (!lastFoundTOCItems.isEmpty()) {
|
||||||
|
lastFoundTOCItems.get(0).getImages().add(image);
|
||||||
|
} else {
|
||||||
|
startImages.add(image);
|
||||||
|
}
|
||||||
|
image.setAppendedToSection(true);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!header.isEmpty()) {
|
||||||
|
headers.add(new ClassificationHeader(header));
|
||||||
|
}
|
||||||
|
if (!footer.isEmpty()) {
|
||||||
|
footers.add(new ClassificationFooter(footer));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!startBlocks.isEmpty()) {
|
||||||
|
TableOfContentItem unassigned = new TableOfContentItem(null);
|
||||||
|
unassigned.setSectionBlocks(startBlocks);
|
||||||
|
unassigned.setImages(startImages);
|
||||||
|
document.getTableOfContents().getMainSections().add(0, unassigned);
|
||||||
|
}
|
||||||
|
document.setHeaders(headers);
|
||||||
|
document.setFooters(footers);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void mergeTableMetadata(TablePageBlock currentTable, TablePageBlock previousTable) {
|
||||||
|
|
||||||
|
// Distribute header information for subsequent tables
|
||||||
|
if (previousTable != null && hasInvalidHeaderInformation(currentTable) && hasValidHeaderInformation(previousTable)) {
|
||||||
|
List<Cell> previousTableNonHeaderRow = getRowWithNonHeaderCells(previousTable);
|
||||||
|
List<Cell> tableNonHeaderRow = getRowWithNonHeaderCells(currentTable);
|
||||||
|
// Allow merging of tables if header row is separated from first logical non-header row
|
||||||
|
if (previousTableNonHeaderRow.isEmpty()
|
||||||
|
&& previousTable.getRowCount() == 1
|
||||||
|
&& previousTable.getRows()
|
||||||
|
.get(0).size() == tableNonHeaderRow.size()) {
|
||||||
|
previousTableNonHeaderRow = previousTable.getRows()
|
||||||
|
.get(0)
|
||||||
|
.stream()
|
||||||
|
.map(cell -> {
|
||||||
|
Cell fakeCell = Cell.copy(cell);
|
||||||
|
fakeCell.setHeaderCells(Collections.singletonList(cell));
|
||||||
|
return fakeCell;
|
||||||
|
})
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
if (previousTableNonHeaderRow.size() == tableNonHeaderRow.size()) {
|
||||||
|
for (int i = currentTable.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||||
|
List<Cell> row = currentTable.getRows()
|
||||||
|
.get(i);
|
||||||
|
if (row.size() == tableNonHeaderRow.size() && row.stream()
|
||||||
|
.allMatch(cell -> cell.getHeaderCells().isEmpty())) {
|
||||||
|
for (int j = 0; j < row.size(); j++) {
|
||||||
|
row.get(j).setHeaderCells(previousTableNonHeaderRow.get(j).getHeaderCells());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean hasValidHeaderInformation(TablePageBlock table) {
|
||||||
|
|
||||||
|
return !hasInvalidHeaderInformation(table);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean hasInvalidHeaderInformation(TablePageBlock table) {
|
||||||
|
|
||||||
|
return table.getRows()
|
||||||
|
.stream()
|
||||||
|
.flatMap(row -> row.stream()
|
||||||
|
.filter(cell -> !cell.getHeaderCells().isEmpty()))
|
||||||
|
.findAny().isEmpty();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<Cell> getRowWithNonHeaderCells(TablePageBlock table) {
|
||||||
|
|
||||||
|
for (int i = table.getRowCount() - 1; i >= 0; i--) { // Non header rows are most likely at bottom of table
|
||||||
|
List<Cell> row = table.getRows()
|
||||||
|
.get(i);
|
||||||
|
if (row.size() == 1) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
boolean allNonHeader = true;
|
||||||
|
for (Cell cell : row) {
|
||||||
|
if (cell.isHeaderCell()) {
|
||||||
|
allNonHeader = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (allNonHeader) {
|
||||||
|
return row;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return Collections.emptyList();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,110 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SectionNode;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.EqualsAndHashCode;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@EqualsAndHashCode(onlyExplicitlyIncluded = true)
|
||||||
|
public class TableOfContentItem {
|
||||||
|
|
||||||
|
@EqualsAndHashCode.Include
|
||||||
|
private TextPageBlock headline;
|
||||||
|
private List<TableOfContentItem> children = new ArrayList<>();
|
||||||
|
private TableOfContentItem parent;
|
||||||
|
|
||||||
|
private List<AbstractPageBlock> sectionBlocks = new ArrayList<>();
|
||||||
|
private List<ClassifiedImage> images = new ArrayList<>();
|
||||||
|
|
||||||
|
private SectionNode section;
|
||||||
|
|
||||||
|
|
||||||
|
public TableOfContentItem(TextPageBlock headline) {
|
||||||
|
|
||||||
|
this.headline = headline;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void addChild(TableOfContentItem tableOfContentItem) {
|
||||||
|
|
||||||
|
children.add(tableOfContentItem);
|
||||||
|
tableOfContentItem.setParent(this);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public TableOfContentItem getSiblingBefore() {
|
||||||
|
|
||||||
|
if (parent != null) {
|
||||||
|
int index = parent.getChildren().indexOf(this);
|
||||||
|
if (index > 0) {
|
||||||
|
return parent.getChildren()
|
||||||
|
.get(index - 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public TableOfContentItem getSiblingAfter() {
|
||||||
|
|
||||||
|
if (parent != null) {
|
||||||
|
int index = parent.getChildren().indexOf(this);
|
||||||
|
if (index >= 0 && index < parent.getChildren().size() - 1) {
|
||||||
|
return parent.getChildren()
|
||||||
|
.get(index + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean contains(TextPageBlock block) {
|
||||||
|
|
||||||
|
if (headline.equals(block)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
for (TableOfContentItem child : children) {
|
||||||
|
if (child.contains(block)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public boolean contains(TableOfContentItem tocItem) {
|
||||||
|
|
||||||
|
if (this.equals(tocItem)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
for (TableOfContentItem child : children) {
|
||||||
|
if (child.contains(tocItem)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<AbstractPageBlock> getNonEmptySectionBlocks() {
|
||||||
|
|
||||||
|
return sectionBlocks.stream().filter(pageBlock -> !pageBlock.isEmpty()).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
|
||||||
|
return "OutlineObjectTreeNode{" + "textPageBlock=" + headline + '}';
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
@ -0,0 +1,136 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.model.outline;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Stack;
|
||||||
|
|
||||||
|
import org.springframework.lang.NonNull;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
import lombok.RequiredArgsConstructor;
|
||||||
|
|
||||||
|
@Data
|
||||||
|
@RequiredArgsConstructor
|
||||||
|
public class TableOfContents implements Iterable<TableOfContentItem> {
|
||||||
|
|
||||||
|
private List<TableOfContentItem> mainSections = new ArrayList<>();
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Creates a table of contents backed by the given list; the list is stored as is, not copied.
 *
 * @param mainSections the top-level items
 */
public TableOfContents(List<TableOfContentItem> mainSections) {

    this.mainSections = mainSections;
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<TextPageBlock> getAllTextPageBlocks() {
|
||||||
|
|
||||||
|
List<TextPageBlock> allTextPageBlocks = new ArrayList<>();
|
||||||
|
for (TableOfContentItem item : mainSections) {
|
||||||
|
collectTextPageBlocks(item, allTextPageBlocks);
|
||||||
|
}
|
||||||
|
return allTextPageBlocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void collectTextPageBlocks(TableOfContentItem item, List<TextPageBlock> textPageBlocks) {
|
||||||
|
|
||||||
|
textPageBlocks.add(item.getHeadline());
|
||||||
|
for (TableOfContentItem child : item.getChildren()) {
|
||||||
|
collectTextPageBlocks(child, textPageBlocks);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<TableOfContentItem> getAllTableOfContentItems() {
|
||||||
|
|
||||||
|
List<TableOfContentItem> allItems = new ArrayList<>();
|
||||||
|
for (TableOfContentItem item : mainSections) {
|
||||||
|
collectTableOfContentItems(item, allItems);
|
||||||
|
}
|
||||||
|
return allItems;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void collectTableOfContentItems(TableOfContentItem item, List<TableOfContentItem> allItems) {
|
||||||
|
|
||||||
|
allItems.add(item);
|
||||||
|
for (TableOfContentItem child : item.getChildren()) {
|
||||||
|
collectTableOfContentItems(child, allItems);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean containsBlock(TextPageBlock block) {
|
||||||
|
|
||||||
|
for (TableOfContentItem existingItem : this.getMainSections()) {
|
||||||
|
if (existingItem.getHeadline().equals(block) || existingItem.contains(block)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean containsItem(TableOfContentItem tocItem) {
|
||||||
|
|
||||||
|
for (TableOfContentItem existingItem : this.getMainSections()) {
|
||||||
|
if (existingItem.equals(tocItem) || existingItem.contains(tocItem)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public @NonNull Iterator<TableOfContentItem> iterator() {
|
||||||
|
|
||||||
|
return new TableOfContentItemIterator(mainSections);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static class TableOfContentItemIterator implements Iterator<TableOfContentItem> {
|
||||||
|
|
||||||
|
private final Stack<Iterator<TableOfContentItem>> stack = new Stack<>();
|
||||||
|
|
||||||
|
|
||||||
|
TableOfContentItemIterator(List<TableOfContentItem> mainSections) {
|
||||||
|
|
||||||
|
stack.push(mainSections.iterator());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean hasNext() {
|
||||||
|
|
||||||
|
ensureStackTopIsCurrent();
|
||||||
|
return !stack.isEmpty() && stack.peek().hasNext();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TableOfContentItem next() {
|
||||||
|
|
||||||
|
ensureStackTopIsCurrent();
|
||||||
|
TableOfContentItem currentItem = stack.peek().next();
|
||||||
|
if (currentItem.getChildren() != null && !currentItem.getChildren().isEmpty()) {
|
||||||
|
stack.push(currentItem.getChildren()
|
||||||
|
.iterator());
|
||||||
|
}
|
||||||
|
return currentItem;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void ensureStackTopIsCurrent() {
|
||||||
|
|
||||||
|
while (!stack.isEmpty() && !stack.peek().hasNext()) {
|
||||||
|
stack.pop();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -58,6 +58,20 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@JsonIgnore
|
||||||
|
public float getPageHeight() {
|
||||||
|
|
||||||
|
return sequences.get(0).getPageHeight();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@JsonIgnore
|
||||||
|
public float getPageWidth() {
|
||||||
|
|
||||||
|
return sequences.get(0).getPageWidth();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private void calculateBBox() {
|
private void calculateBBox() {
|
||||||
|
|
||||||
if (sequences == null) {
|
if (sequences == null) {
|
||||||
@ -69,6 +83,12 @@ public class TextPageBlock extends AbstractPageBlock {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void recalculateBBox() {
|
||||||
|
|
||||||
|
calculateBBox();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
public static TextPageBlock merge(List<TextPageBlock> textBlocksToMerge) {
|
||||||
|
|
||||||
if (textBlocksToMerge.isEmpty()) {
|
if (textBlocksToMerge.isEmpty()) {
|
||||||
|
|||||||
@ -27,8 +27,10 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
|
|
||||||
@Slf4j
|
@Slf4j
|
||||||
@Service
|
@Service
|
||||||
|
@Deprecated
|
||||||
public class SectionsBuilderService {
|
public class SectionsBuilderService {
|
||||||
|
|
||||||
|
|
||||||
public void buildSections(ClassificationDocument document) {
|
public void buildSections(ClassificationDocument document) {
|
||||||
|
|
||||||
List<AbstractPageBlock> chunkWords = new ArrayList<>();
|
List<AbstractPageBlock> chunkWords = new ArrayList<>();
|
||||||
|
|||||||
@ -0,0 +1,525 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.blockification;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.services.blockification.DocstrumBlockificationService.buildTextBlock;
|
||||||
|
|
||||||
|
import java.awt.geom.Rectangle2D;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ListIterator;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.function.Function;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.ClassificationPage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.SectionIdentifier;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.OutlineObject;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPositionSequence;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.SearchTextWithTextPositionFactory;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.utils.RectangleTransformations;
|
||||||
|
|
||||||
|
import lombok.Data;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
public class BlockificationPostprocessingService {
|
||||||
|
|
||||||
|
private static final float BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD = 5.0f;
|
||||||
|
|
||||||
|
private static final Function<TextPageBlock, Rectangle2D> blockToBoundingBox = (abstractPageBlock) -> abstractPageBlock.getSequences()
|
||||||
|
.stream()
|
||||||
|
.map(textPositionSequence -> textPositionSequence.getTextPositions()
|
||||||
|
.stream()
|
||||||
|
.map(tp -> SearchTextWithTextPositionFactory.mapRedTextPositionToInitialUserSpace(tp, textPositionSequence))
|
||||||
|
.collect(RectangleTransformations.collectBBox()))
|
||||||
|
.collect(RectangleTransformations.collectBBox());
|
||||||
|
|
||||||
|
|
||||||
|
public OutlineObject sanitizeOutlineBlocks(ClassificationPage classificationPage, OutlineObject notFoundOutlineObject) {
|
||||||
|
|
||||||
|
List<OutlineObject> outlineObjects = classificationPage.getOutlineObjects();
|
||||||
|
|
||||||
|
if (getTextPageBlocks(classificationPage).isEmpty() || outlineObjects.isEmpty()) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
float pageHeight = classificationPage.getPageHeight();
|
||||||
|
|
||||||
|
ListIterator<OutlineObject> outlineObjectListIterator = outlineObjects.listIterator();
|
||||||
|
|
||||||
|
if (notFoundOutlineObject != null) {
|
||||||
|
OutlineProcessionContext notFoundOutlineObjectProcessionContext = new OutlineProcessionContext(notFoundOutlineObject);
|
||||||
|
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, notFoundOutlineObjectProcessionContext);
|
||||||
|
|
||||||
|
OutlineObject firstOutlineObject = null;
|
||||||
|
OutlineProcessionContext firstOutlineObjectProcessionContext = null;
|
||||||
|
if (outlineObjectListIterator.hasNext()) {
|
||||||
|
firstOutlineObject = outlineObjectListIterator.next();
|
||||||
|
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
|
||||||
|
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!contextsOverlap(notFoundOutlineObjectProcessionContext, firstOutlineObjectProcessionContext)) {
|
||||||
|
notFoundOutlineObject.setFound(selectMatch(classificationPage, notFoundOutlineObjectProcessionContext));
|
||||||
|
}
|
||||||
|
if (firstOutlineObject != null) {
|
||||||
|
// re-create the context for the updated blocks
|
||||||
|
firstOutlineObjectProcessionContext = new OutlineProcessionContext(firstOutlineObject);
|
||||||
|
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, firstOutlineObjectProcessionContext);
|
||||||
|
firstOutlineObject.setFound(selectMatch(classificationPage, firstOutlineObjectProcessionContext));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
outlineObjectListIterator.forEachRemaining(outlineObject -> {
|
||||||
|
OutlineProcessionContext outlineObjectProcessionContext = new OutlineProcessionContext(outlineObject);
|
||||||
|
processTextBlocks(getTextPageBlocks(classificationPage), pageHeight, outlineObjectProcessionContext);
|
||||||
|
outlineObject.setFound(selectMatch(classificationPage, outlineObjectProcessionContext));
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!outlineObjects.isEmpty()) {
|
||||||
|
return outlineObjects.get(outlineObjects.size() - 1);
|
||||||
|
} else {
|
||||||
|
return notFoundOutlineObject;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<TextPageBlock> getTextPageBlocks(ClassificationPage classificationPage) {
|
||||||
|
|
||||||
|
return classificationPage.getTextBlocks()
|
||||||
|
.stream()
|
||||||
|
.filter(block -> block instanceof TextPageBlock)
|
||||||
|
.map(block -> (TextPageBlock) block)
|
||||||
|
.toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean contextsOverlap(OutlineProcessionContext notFoundOutlineObjectProcessionContext, OutlineProcessionContext firstOutlineObjectProcessionContext) {
|
||||||
|
|
||||||
|
if (firstOutlineObjectProcessionContext == null) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
String notFoundTitle = notFoundOutlineObjectProcessionContext.getOutlineObject().getTitle();
|
||||||
|
String firstTitle = firstOutlineObjectProcessionContext.getOutlineObject().getTitle();
|
||||||
|
|
||||||
|
if (!firstTitle.startsWith(notFoundTitle)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
var blocksOfNotFoundOutline = getAllMatchingBlocks(notFoundOutlineObjectProcessionContext);
|
||||||
|
var blocksOfFirstOutline = getAllMatchingBlocks(firstOutlineObjectProcessionContext);
|
||||||
|
|
||||||
|
double maxYFirst = blocksOfFirstOutline.stream()
|
||||||
|
.mapToDouble(TextPageBlock::getPdfMaxY)
|
||||||
|
.max()
|
||||||
|
.orElse(Double.NEGATIVE_INFINITY);
|
||||||
|
|
||||||
|
return blocksOfNotFoundOutline.stream()
|
||||||
|
.mapToDouble(TextPageBlock::getPdfMaxY)
|
||||||
|
.anyMatch(y -> y >= maxYFirst);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<TextPageBlock> getAllMatchingBlocks(OutlineProcessionContext context) {
|
||||||
|
|
||||||
|
List<TextPageBlock> blocks = new ArrayList<>();
|
||||||
|
if (context.getDirectMatch() != null) {
|
||||||
|
blocks.add(context.getDirectMatch());
|
||||||
|
}
|
||||||
|
if (context.getSplitCandidate() != null) {
|
||||||
|
blocks.add(context.getSplitCandidate());
|
||||||
|
}
|
||||||
|
blocks.addAll(context.getMergeCandidates());
|
||||||
|
return blocks;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void processTextBlocks(List<TextPageBlock> textBlocks, float pageHeight, OutlineProcessionContext context) {
|
||||||
|
|
||||||
|
OutlineObject outlineObject = context.getOutlineObject();
|
||||||
|
ListIterator<TextPageBlock> iterator = textBlocks.listIterator();
|
||||||
|
while (iterator.hasNext()) {
|
||||||
|
TextPageBlock pageBlock = iterator.next();
|
||||||
|
if (pageHeight - outlineObject.getPoint().getY() - BLOCK_TO_OUTLINE_DISTANCE_THRESHOLD <= pageBlock.getMaxY()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (iterator.hasPrevious()) {
|
||||||
|
iterator.previous();
|
||||||
|
}
|
||||||
|
boolean earlyStop = false;
|
||||||
|
while (iterator.hasNext() && !earlyStop) {
|
||||||
|
TextPageBlock pageBlock = iterator.next();
|
||||||
|
earlyStop = processOutlineForTextBlock(pageBlock, context);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean selectMatch(ClassificationPage classificationPage, OutlineProcessionContext context) {
|
||||||
|
|
||||||
|
OutlineObject outlineObject = context.outlineObject;
|
||||||
|
TextPageBlock directMatch = context.directMatch;
|
||||||
|
List<TextPageBlock> mergeCandidates = context.mergeCandidates;
|
||||||
|
TextPageBlock splitCandidate = context.splitCandidate;
|
||||||
|
PageBlockType headlineType = PageBlockType.getHeadlineType(outlineObject.getTreeDepth());
|
||||||
|
|
||||||
|
double distanceToDirectMatch = directMatch != null ? calculateDistance(outlineObject, directMatch) : Double.MAX_VALUE;
|
||||||
|
double distanceToSplitCandidate = splitCandidate != null ? calculateDistance(outlineObject, splitCandidate) : Double.MAX_VALUE;
|
||||||
|
|
||||||
|
double distanceToBestMergeCandidates = Double.MAX_VALUE;
|
||||||
|
List<TextPageBlock> bestMergeCandidateCombination = new ArrayList<>();
|
||||||
|
if (!mergeCandidates.isEmpty()) {
|
||||||
|
|
||||||
|
// with this code adjacent blocks to the first and last merge candidate get added, this could be useful for some edge cases:
|
||||||
|
//List<TextPageBlock> allMergeCandidates = new ArrayList<>(mergeCandidates);
|
||||||
|
//addNeighborsOfCandidate(kdTree, mergeCandidates.get(0), allMergeCandidates);
|
||||||
|
//if (mergeCandidates.size() > 1) {
|
||||||
|
// addNeighborsOfCandidate(kdTree, mergeCandidates.get(mergeCandidates.size() - 1), allMergeCandidates);
|
||||||
|
//}
|
||||||
|
//allMergeCandidates = allMergeCandidates.stream()
|
||||||
|
// .distinct()
|
||||||
|
// .toList();
|
||||||
|
|
||||||
|
List<List<TextPageBlock>> combinations = findCombinations(outlineObject.getTitle(), mergeCandidates);
|
||||||
|
|
||||||
|
for (List<TextPageBlock> combination : combinations) {
|
||||||
|
double averageDistance = combination.stream()
|
||||||
|
.map(block -> calculateDistance(outlineObject, block))
|
||||||
|
.mapToDouble(Double::doubleValue).average()
|
||||||
|
.orElse(Double.MAX_VALUE);
|
||||||
|
if (distanceToBestMergeCandidates > averageDistance) {
|
||||||
|
distanceToBestMergeCandidates = averageDistance;
|
||||||
|
bestMergeCandidateCombination = combination;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double minDistance = Math.min(distanceToDirectMatch, Math.min(distanceToSplitCandidate, distanceToBestMergeCandidates));
|
||||||
|
|
||||||
|
if (minDistance == Double.MAX_VALUE) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (minDistance == distanceToDirectMatch) {
|
||||||
|
directMatch.setClassification(headlineType);
|
||||||
|
} else if (minDistance == distanceToSplitCandidate) {
|
||||||
|
SplitBlockResult splitBlockResult = splitBlock(classificationPage, splitCandidate, context.sectionIdentifier, outlineObject.getTitle());
|
||||||
|
if (splitBlockResult.modifiedBlockToSplit) {
|
||||||
|
splitCandidate.setClassification(headlineType);
|
||||||
|
}
|
||||||
|
splitBlockResult.otherBlocks.forEach(other -> other.setClassification(null));
|
||||||
|
} else {
|
||||||
|
var merged = mergeBlocks(classificationPage, bestMergeCandidateCombination);
|
||||||
|
merged.setClassification(headlineType);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private SplitBlockResult splitBlock(ClassificationPage classificationPage, TextPageBlock blockToSplit, SectionIdentifier sectionIdentifier, String title) {
|
||||||
|
|
||||||
|
List<TextPageBlock> otherBlocks = new ArrayList<>();
|
||||||
|
int blockToSplitIdx = classificationPage.getTextBlocks().indexOf(blockToSplit);
|
||||||
|
|
||||||
|
String headline = title;
|
||||||
|
if (!sectionIdentifier.getFormat().equals(SectionIdentifier.Format.EMPTY) && !title.startsWith(sectionIdentifier.getIdentifierString())) {
|
||||||
|
headline = sectionIdentifier + headline;
|
||||||
|
}
|
||||||
|
|
||||||
|
WordSequenceResult wordSequenceResult = findWordSequence(blockToSplit.getSequences(), headline);
|
||||||
|
if (wordSequenceResult.inSequence.isEmpty() && !headline.equals(title)) {
|
||||||
|
wordSequenceResult = findWordSequence(blockToSplit.getSequences(), title);
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean modifiedBlockToSplit = false;
|
||||||
|
if (!wordSequenceResult.inSequence.isEmpty()) {
|
||||||
|
blockToSplit.setSequences(wordSequenceResult.inSequence);
|
||||||
|
blockToSplit.recalculateBBox();
|
||||||
|
modifiedBlockToSplit = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!wordSequenceResult.preSequence.isEmpty()) {
|
||||||
|
TextPageBlock block = buildTextBlock(wordSequenceResult.preSequence, 0);
|
||||||
|
classificationPage.getTextBlocks().add(blockToSplitIdx, block);
|
||||||
|
otherBlocks.add(block);
|
||||||
|
blockToSplitIdx++;
|
||||||
|
}
|
||||||
|
if (!wordSequenceResult.postSequence.isEmpty()) {
|
||||||
|
TextPageBlock block = buildTextBlock(wordSequenceResult.postSequence, 0);
|
||||||
|
classificationPage.getTextBlocks().add(blockToSplitIdx + 1, block);
|
||||||
|
otherBlocks.add(block);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new SplitBlockResult(modifiedBlockToSplit, otherBlocks);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static WordSequenceResult findWordSequence(List<TextPositionSequence> textPositionSequences, String text) {
|
||||||
|
|
||||||
|
String target = sanitizeString(text);
|
||||||
|
List<TextPositionSequence> inSequence = new ArrayList<>();
|
||||||
|
List<TextPositionSequence> preSequence = new ArrayList<>();
|
||||||
|
List<TextPositionSequence> postSequence = new ArrayList<>();
|
||||||
|
StringBuilder currentSequence = new StringBuilder();
|
||||||
|
|
||||||
|
for (TextPositionSequence sequence : textPositionSequences) {
|
||||||
|
|
||||||
|
currentSequence.append(sanitizeString(sequence.toString()));
|
||||||
|
inSequence.add(sequence);
|
||||||
|
|
||||||
|
if (currentSequence.length() >= target.length()) {
|
||||||
|
|
||||||
|
if (currentSequence.toString().endsWith(target)) {
|
||||||
|
|
||||||
|
int index = 0;
|
||||||
|
String toRemove = currentSequence.substring(0, currentSequence.length() - target.length());
|
||||||
|
|
||||||
|
TextPositionSequence next = inSequence.get(index);
|
||||||
|
while (currentSequence.length() - next.length() >= target.length()) {
|
||||||
|
|
||||||
|
TextPositionSequence removed = inSequence.remove(index);
|
||||||
|
currentSequence.delete(0, removed.toString().length());
|
||||||
|
preSequence.add(removed);
|
||||||
|
|
||||||
|
next = inSequence.get(index);
|
||||||
|
toRemove = toRemove.substring(removed.length());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!toRemove.isEmpty()) {
|
||||||
|
SplitSequenceResult splitSequenceResult = splitSequence(inSequence.remove(index), toRemove);
|
||||||
|
|
||||||
|
currentSequence.delete(0, splitSequenceResult.out.length());
|
||||||
|
preSequence.add(splitSequenceResult.out);
|
||||||
|
inSequence.add(index, splitSequenceResult.in);
|
||||||
|
}
|
||||||
|
|
||||||
|
} else if (currentSequence.toString().startsWith(target)) {
|
||||||
|
|
||||||
|
int index = inSequence.size() - 1;
|
||||||
|
String toRemove = currentSequence.substring(target.length());
|
||||||
|
|
||||||
|
SplitSequenceResult splitSequenceResult = splitSequence(inSequence.remove(index), toRemove);
|
||||||
|
currentSequence.delete(currentSequence.length() - splitSequenceResult.out.length(), currentSequence.length());
|
||||||
|
|
||||||
|
inSequence.add(index, splitSequenceResult.in);
|
||||||
|
postSequence.add(splitSequenceResult.out);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (currentSequence.toString().equals(target)) {
|
||||||
|
postSequence.addAll(textPositionSequences.subList(textPositionSequences.indexOf(sequence) + 1, textPositionSequences.size()));
|
||||||
|
return new WordSequenceResult(inSequence, preSequence, postSequence);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return new WordSequenceResult();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static SplitSequenceResult splitSequence(TextPositionSequence sequence, String toRemove) {
|
||||||
|
|
||||||
|
TextPositionSequence in = null;
|
||||||
|
TextPositionSequence out;
|
||||||
|
|
||||||
|
String currentSequence = sequence.toString();
|
||||||
|
int index = currentSequence.indexOf(toRemove);
|
||||||
|
int endIndex = index + toRemove.length();
|
||||||
|
|
||||||
|
out = createSubSequence(sequence, index, endIndex);
|
||||||
|
|
||||||
|
if (index > 0) {
|
||||||
|
in = createSubSequence(sequence, 0, index);
|
||||||
|
} else if (endIndex < sequence.getTextPositions().size()) {
|
||||||
|
in = createSubSequence(sequence, endIndex, sequence.getTextPositions().size());
|
||||||
|
}
|
||||||
|
|
||||||
|
return new SplitSequenceResult(in, out);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static TextPositionSequence createSubSequence(TextPositionSequence sequence, int start, int end) {
|
||||||
|
|
||||||
|
TextPositionSequence newSeq = new TextPositionSequence(new ArrayList<>(sequence.getTextPositions().subList(start, end)), sequence.getPage());
|
||||||
|
newSeq.setParagraphStart(sequence.isParagraphStart());
|
||||||
|
return newSeq;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private TextPageBlock mergeBlocks(ClassificationPage classificationPage, List<TextPageBlock> blocksToMerge) {
|
||||||
|
|
||||||
|
TextPageBlock firstBlock = blocksToMerge.get(0);
|
||||||
|
|
||||||
|
if (blocksToMerge.size() > 1) {
|
||||||
|
|
||||||
|
List<TextPageBlock> mergedBlocks = new ArrayList<>();
|
||||||
|
for (TextPageBlock textPageBlock : blocksToMerge.subList(1, blocksToMerge.size())) {
|
||||||
|
|
||||||
|
if (firstBlock != null && !firstBlock.getSequences().isEmpty()) {
|
||||||
|
|
||||||
|
if (textPageBlock.getDir() == firstBlock.getDir()) {
|
||||||
|
firstBlock.getSequences().addAll(textPageBlock.getSequences());
|
||||||
|
mergedBlocks.add(textPageBlock);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
assert firstBlock != null;
|
||||||
|
firstBlock.setToDuplicate(false);
|
||||||
|
firstBlock.recalculateBBox();
|
||||||
|
classificationPage.getTextBlocks().removeAll(mergedBlocks);
|
||||||
|
}
|
||||||
|
|
||||||
|
return firstBlock;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static List<List<TextPageBlock>> findCombinations(String title, List<TextPageBlock> blocks) {
|
||||||
|
|
||||||
|
List<List<TextPageBlock>> combinations = new ArrayList<>();
|
||||||
|
findCombinations(title, blocks, new ArrayList<>(), combinations);
|
||||||
|
return combinations;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static void findCombinations(String title, List<TextPageBlock> blocks, List<TextPageBlock> current, List<List<TextPageBlock>> combinations) {
|
||||||
|
|
||||||
|
String target = sanitizeString(title);
|
||||||
|
if (target.isEmpty()) {
|
||||||
|
combinations.add(new ArrayList<>(current));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<TextPageBlock> remaining = blocks.stream()
|
||||||
|
.filter(block -> !current.contains(block))
|
||||||
|
.toList();
|
||||||
|
for (TextPageBlock block : remaining) {
|
||||||
|
String prefix = sanitizeString(block.getText());
|
||||||
|
if (target.startsWith(prefix)) {
|
||||||
|
current.add(block);
|
||||||
|
findCombinations(target.substring(prefix.length()), blocks.subList(blocks.indexOf(block) + 1, blocks.size()), current, combinations);
|
||||||
|
current.remove(current.size() - 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private double calculateDistance(OutlineObject outlineObject, TextPageBlock pageBlock) {
|
||||||
|
|
||||||
|
double deltaX = outlineObject.getPoint().getX() - pageBlock.getMinX();
|
||||||
|
double deltaY = pageBlock.getPageHeight() - outlineObject.getPoint().getY() - pageBlock.getMinY();
|
||||||
|
return Math.sqrt(deltaX * deltaX + deltaY * deltaY);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// currently only three cases are handled here:
|
||||||
|
// 1. equality
|
||||||
|
// 2. outline title contains block text
|
||||||
|
// 3. block text contains outline title
|
||||||
|
// another possible case is an intersection, meaning a title is split up between two different blocks
|
||||||
|
// this should not happen with how docstrum creates the blocks
|
||||||
|
// if it is indeed necessary, a splitting has to be done with a follow-up merge
|
||||||
|
private boolean processOutlineForTextBlock(TextPageBlock pageBlock, OutlineProcessionContext context) {
|
||||||
|
|
||||||
|
OutlineObject outlineObject = context.getOutlineObject();
|
||||||
|
String blockText = sanitizeString(pageBlock.getText());
|
||||||
|
String outlineTitle = sanitizeString(outlineObject.getTitle());
|
||||||
|
|
||||||
|
boolean blockTextContainsOutlineTitle = blockText.contains(outlineTitle);
|
||||||
|
boolean outlineTitleContainsBlockText = outlineTitle.contains(blockText);
|
||||||
|
|
||||||
|
if (!blockTextContainsOutlineTitle && !outlineTitleContainsBlockText) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (blockText.equals(outlineTitle) && context.directMatch == null) {
|
||||||
|
context.directMatch = pageBlock;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (outlineTitleContainsBlockText) {
|
||||||
|
context.mergeCandidates.add(pageBlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (blockTextContainsOutlineTitle) {
|
||||||
|
SectionIdentifier sectionIdentifier = SectionIdentifier.fromSearchText(blockText);
|
||||||
|
|
||||||
|
if (sectionIdentifier.getFormat() != SectionIdentifier.Format.EMPTY && !outlineTitle.startsWith(sectionIdentifier.getIdentifierString())) {
|
||||||
|
|
||||||
|
if (blockText.startsWith(sectionIdentifier.getIdentifierString()) && blockText.endsWith(outlineTitle) && context.directMatch == null) {
|
||||||
|
context.directMatch = pageBlock;
|
||||||
|
return true;
|
||||||
|
} else if (context.splitCandidate == null) {
|
||||||
|
context.sectionIdentifier = sectionIdentifier;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (context.splitCandidate == null) {
|
||||||
|
context.splitCandidate = pageBlock;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static String sanitizeString(String text) {
|
||||||
|
|
||||||
|
return StringUtils.deleteWhitespace(text).toLowerCase(Locale.ROOT);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Data
|
||||||
|
private static class OutlineProcessionContext {
|
||||||
|
|
||||||
|
private TextPageBlock directMatch;
|
||||||
|
private OutlineObject outlineObject;
|
||||||
|
private List<TextPageBlock> mergeCandidates;
|
||||||
|
private TextPageBlock splitCandidate;
|
||||||
|
private SectionIdentifier sectionIdentifier;
|
||||||
|
|
||||||
|
|
||||||
|
OutlineProcessionContext(OutlineObject outlineObject) {
|
||||||
|
|
||||||
|
this.outlineObject = outlineObject;
|
||||||
|
this.directMatch = null;
|
||||||
|
this.mergeCandidates = new ArrayList<>();
|
||||||
|
this.splitCandidate = null;
|
||||||
|
this.sectionIdentifier = SectionIdentifier.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public static class WordSequenceResult {
|
||||||
|
|
||||||
|
public List<TextPositionSequence> inSequence;
|
||||||
|
public List<TextPositionSequence> preSequence;
|
||||||
|
public List<TextPositionSequence> postSequence;
|
||||||
|
|
||||||
|
|
||||||
|
public WordSequenceResult(List<TextPositionSequence> inSequence, List<TextPositionSequence> preSequence, List<TextPositionSequence> postSequence) {
|
||||||
|
|
||||||
|
this.inSequence = inSequence;
|
||||||
|
this.preSequence = preSequence;
|
||||||
|
this.postSequence = postSequence;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public WordSequenceResult() {
|
||||||
|
|
||||||
|
this.inSequence = new ArrayList<>();
|
||||||
|
this.preSequence = new ArrayList<>();
|
||||||
|
this.postSequence = new ArrayList<>();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public record SplitBlockResult(boolean modifiedBlockToSplit, List<TextPageBlock> otherBlocks) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public record SplitSequenceResult(TextPositionSequence in, TextPositionSequence out) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -126,6 +126,16 @@ public class DocstrumBlockificationService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (current.isHeadline() || previous.isHeadline()) {
|
||||||
|
if (intersectsYWithPreviousHavingMaxOneLine(previous, current, page)) {
|
||||||
|
previous = combineBlocksAndResetIterator(previous, current, itty, false);
|
||||||
|
} else {
|
||||||
|
previous = current;
|
||||||
|
}
|
||||||
|
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
if (areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(previous, current, page)) {
|
||||||
previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
previous = combineBlocksAndResetIterator(previous, current, itty, true);
|
||||||
continue;
|
continue;
|
||||||
@ -172,6 +182,12 @@ public class DocstrumBlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private boolean intersectsYWithPreviousHavingMaxOneLine(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||||
|
|
||||||
|
return previous.intersectsY(current) && (previous.getNumberOfLines() == 1 && current.getNumberOfLines() >= 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
private boolean areTheOnlyTwoBlocksOnHeightsWithBothMoreThanTwoLines(TextPageBlock previous, TextPageBlock current, ClassificationPage page) {
|
||||||
|
|
||||||
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
|
return previous.getNumberOfLines() >= 2 && current.getNumberOfLines() >= 2 //
|
||||||
@ -185,6 +201,9 @@ public class DocstrumBlockificationService {
|
|||||||
previous.getSequences().addAll(current.getSequences());
|
previous.getSequences().addAll(current.getSequences());
|
||||||
previous = buildTextBlock(previous.getSequences(), 0);
|
previous = buildTextBlock(previous.getSequences(), 0);
|
||||||
previous.setToDuplicate(toDuplicate);
|
previous.setToDuplicate(toDuplicate);
|
||||||
|
if (current.getClassification() != null && previous.getClassification() == null) {
|
||||||
|
previous.setClassification(current.getClassification());
|
||||||
|
}
|
||||||
itty.remove();
|
itty.remove();
|
||||||
itty.previous();
|
itty.previous();
|
||||||
itty.set(previous);
|
itty.set(previous);
|
||||||
@ -244,21 +263,30 @@ public class DocstrumBlockificationService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (block.getClassification() != null && block.getClassification().isHeadline()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
TextPageBlock current = (TextPageBlock) block;
|
TextPageBlock current = (TextPageBlock) block;
|
||||||
|
|
||||||
for (int i = 0; i < blocks.size(); i++) {
|
for (int i = 0; i < blocks.size(); i++) {
|
||||||
|
|
||||||
if (blocks.get(i) == null) {
|
AbstractPageBlock abstractPageBlock = blocks.get(i);
|
||||||
|
if (abstractPageBlock == null) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (blocks.get(i) == current) {
|
if (abstractPageBlock == current) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (blocks.get(i) instanceof TablePageBlock) {
|
if (abstractPageBlock instanceof TablePageBlock) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
TextPageBlock inner = (TextPageBlock) blocks.get(i);
|
if (abstractPageBlock.getClassification() != null && abstractPageBlock.getClassification().isHeadline()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
TextPageBlock inner = (TextPageBlock) abstractPageBlock;
|
||||||
|
|
||||||
if (usedRulings.lineBetween(current, blocks.get(i))) {
|
if (usedRulings.lineBetween(current, blocks.get(i))) {
|
||||||
continue;
|
continue;
|
||||||
@ -285,7 +313,7 @@ public class DocstrumBlockificationService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
public static TextPageBlock buildTextBlock(List<TextPositionSequence> wordBlockList, int indexOnPage) {
|
||||||
|
|
||||||
return new TextPageBlock(wordBlockList);
|
return new TextPageBlock(wordBlockList);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -21,12 +21,16 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class ClarifyndClassificationService {
|
public class ClarifyndClassificationService {
|
||||||
|
|
||||||
|
private final HeadlineClassificationService headlineClassificationService;
|
||||||
|
|
||||||
public void classifyDocument(ClassificationDocument document) {
|
public void classifyDocument(ClassificationDocument document) {
|
||||||
|
|
||||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||||
|
|
||||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||||
|
|
||||||
|
headlineClassificationService.resetContext();
|
||||||
|
|
||||||
for (ClassificationPage page : document.getPages()) {
|
for (ClassificationPage page : document.getPages()) {
|
||||||
classifyPage(page, document, headlineFontSizes);
|
classifyPage(page, document, headlineFontSizes);
|
||||||
}
|
}
|
||||||
@ -47,6 +51,10 @@ public class ClarifyndClassificationService {
|
|||||||
|
|
||||||
var bodyTextFrame = page.getBodyTextFrame();
|
var bodyTextFrame = page.getBodyTextFrame();
|
||||||
|
|
||||||
|
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||||
|
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
return;
|
return;
|
||||||
@ -79,7 +87,8 @@ public class ClarifyndClassificationService {
|
|||||||
|
|
||||||
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
||||||
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(i));
|
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
|
||||||
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -89,7 +98,8 @@ public class ClarifyndClassificationService {
|
|||||||
.getTextPositions()
|
.getTextPositions()
|
||||||
.get(0)
|
.get(0)
|
||||||
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
|
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
|
||||||
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
||||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||||
|
|||||||
@ -24,6 +24,7 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class DocuMineClassificationService {
|
public class DocuMineClassificationService {
|
||||||
|
|
||||||
|
private final HeadlineClassificationService headlineClassificationService;
|
||||||
private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern pattern = Pattern.compile("^([1-9]\\d?\\.){1,3}\\d{1,2}\\.?\\s[0-9A-Za-z \\[\\]]{2,50}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
private static final Pattern pattern2 = Pattern.compile("\\p{L}{3,}", Pattern.CASE_INSENSITIVE);
|
||||||
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
private static final Pattern pattern3 = Pattern.compile("^(\\d{1,1}\\.){1,3}\\d{1,2}\\.?\\s[a-z]{1,2}\\/[a-z]{1,2}.*");
|
||||||
@ -35,6 +36,8 @@ public class DocuMineClassificationService {
|
|||||||
|
|
||||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||||
|
|
||||||
|
headlineClassificationService.resetContext();
|
||||||
|
|
||||||
for (ClassificationPage page : document.getPages()) {
|
for (ClassificationPage page : document.getPages()) {
|
||||||
classifyPage(page, document, headlineFontSizes);
|
classifyPage(page, document, headlineFontSizes);
|
||||||
}
|
}
|
||||||
@ -60,6 +63,10 @@ public class DocuMineClassificationService {
|
|||||||
Matcher matcher2 = pattern2.matcher(textBlock.toString());
|
Matcher matcher2 = pattern2.matcher(textBlock.toString());
|
||||||
Matcher matcher3 = pattern3.matcher(textBlock.toString());
|
Matcher matcher3 = pattern3.matcher(textBlock.toString());
|
||||||
|
|
||||||
|
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||||
|
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||||
textBlock.setClassification(PageBlockType.OTHER);
|
textBlock.setClassification(PageBlockType.OTHER);
|
||||||
return;
|
return;
|
||||||
@ -95,6 +102,7 @@ public class DocuMineClassificationService {
|
|||||||
&& (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
|
&& (textBlock.getMostPopularWordHeight() > document.getTextHeightCounter().getMostPopular()
|
||||||
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
|
|| textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular())
|
||||||
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
&& PositionUtils.getApproxLineCount(textBlock) < 5.9
|
||||||
|
|
||||||
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString()
|
&& (textBlock.getMostPopularWordStyle().contains("bold") && Character.isDigit(textBlock.toString().charAt(0)) && matcher2.reset().find() && !textBlock.toString()
|
||||||
.contains(":")
|
.contains(":")
|
||||||
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":")
|
|| textBlock.toString().equals(textBlock.toString().toUpperCase(Locale.ROOT)) && matcher2.reset().find() && !textBlock.toString().contains(":")
|
||||||
@ -103,11 +111,13 @@ public class DocuMineClassificationService {
|
|||||||
|| textBlock.toString().startsWith("TABLE"))
|
|| textBlock.toString().startsWith("TABLE"))
|
||||||
&& !textBlock.toString().endsWith(":")
|
&& !textBlock.toString().endsWith(":")
|
||||||
&& matcher2.reset().find()) {
|
&& matcher2.reset().find()) {
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(1));
|
PageBlockType headlineType = PageBlockType.getHeadlineType(1);
|
||||||
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
|
|
||||||
} else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) {
|
} else if (matcher.reset().find() && PositionUtils.getApproxLineCount(textBlock) < 2.9 && matcher2.reset().find() && !matcher3.reset().matches()) {
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(2));
|
PageBlockType headlineType = PageBlockType.getHeadlineType(2);
|
||||||
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||||
|
|||||||
@ -0,0 +1,62 @@
|
|||||||
|
package com.knecon.fforesight.service.layoutparser.processor.services.classification;
|
||||||
|
|
||||||
|
import static com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType.getHeadlineNumber;
|
||||||
|
|
||||||
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.PageBlockType;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
|
|
||||||
|
import lombok.Getter;
|
||||||
|
import lombok.Setter;
|
||||||
|
|
||||||
|
@Service
|
||||||
|
@Getter
|
||||||
|
@Setter
|
||||||
|
public class HeadlineClassificationService {
|
||||||
|
|
||||||
|
TextPageBlock lastHeadline;
|
||||||
|
PageBlockType originalClassifiedBlockType;
|
||||||
|
TextPageBlock lastHeadlineFromOutline;
|
||||||
|
|
||||||
|
public void resetContext() {
|
||||||
|
setLastHeadline(null);
|
||||||
|
setOriginalClassifiedBlockType(null);
|
||||||
|
setLastHeadlineFromOutline(null);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void setLastHeadlineFromOutline(TextPageBlock lastHeadlineFromOutline) {
|
||||||
|
|
||||||
|
this.lastHeadlineFromOutline = lastHeadlineFromOutline;
|
||||||
|
this.setLastHeadline(lastHeadlineFromOutline);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void classifyHeadline(TextPageBlock textBlock, PageBlockType initialHeadlineType) {
|
||||||
|
|
||||||
|
TextPageBlock lastHeadline = getLastHeadline();
|
||||||
|
TextPageBlock lastHeadlineFromOutline = getLastHeadlineFromOutline();
|
||||||
|
PageBlockType originalClassifiedBlockType = getOriginalClassifiedBlockType();
|
||||||
|
PageBlockType finalHeadlineType = initialHeadlineType;
|
||||||
|
|
||||||
|
if (lastHeadline != null) {
|
||||||
|
|
||||||
|
if (lastHeadline.equals(lastHeadlineFromOutline)) {
|
||||||
|
|
||||||
|
finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(lastHeadline.getClassification()) + 1);
|
||||||
|
|
||||||
|
} else if (originalClassifiedBlockType != null && lastHeadline.getClassification() != originalClassifiedBlockType) {
|
||||||
|
|
||||||
|
PageBlockType lastHeadlineType = lastHeadline.getClassification();
|
||||||
|
int difference = getHeadlineNumber(originalClassifiedBlockType) - getHeadlineNumber(lastHeadlineType);
|
||||||
|
finalHeadlineType = PageBlockType.getHeadlineType(getHeadlineNumber(initialHeadlineType) - difference);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
setOriginalClassifiedBlockType(initialHeadlineType);
|
||||||
|
textBlock.setClassification(finalHeadlineType);
|
||||||
|
setLastHeadline(textBlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
@ -22,12 +22,17 @@ import lombok.extern.slf4j.Slf4j;
|
|||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class RedactManagerClassificationService {
|
public class RedactManagerClassificationService {
|
||||||
|
|
||||||
|
private final HeadlineClassificationService headlineClassificationService;
|
||||||
|
|
||||||
|
|
||||||
public void classifyDocument(ClassificationDocument document) {
|
public void classifyDocument(ClassificationDocument document) {
|
||||||
|
|
||||||
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
List<Float> headlineFontSizes = document.getFontSizeCounter().getHighterThanMostPopular();
|
||||||
|
|
||||||
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
log.debug("Document FontSize counters are: {}", document.getFontSizeCounter().getCountPerValue());
|
||||||
|
|
||||||
|
headlineClassificationService.resetContext();
|
||||||
|
|
||||||
for (ClassificationPage page : document.getPages()) {
|
for (ClassificationPage page : document.getPages()) {
|
||||||
classifyPage(page, document, headlineFontSizes);
|
classifyPage(page, document, headlineFontSizes);
|
||||||
}
|
}
|
||||||
@ -48,6 +53,10 @@ public class RedactManagerClassificationService {
|
|||||||
|
|
||||||
var bodyTextFrame = page.getBodyTextFrame();
|
var bodyTextFrame = page.getBodyTextFrame();
|
||||||
|
|
||||||
|
if (textBlock.getClassification() != null && textBlock.getClassification().isHeadline()) {
|
||||||
|
headlineClassificationService.setLastHeadlineFromOutline(textBlock);
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (document.getFontSizeCounter().getMostPopular() == null) {
|
if (document.getFontSizeCounter().getMostPopular() == null) {
|
||||||
textBlock.setClassification(PageBlockType.OTHER);
|
textBlock.setClassification(PageBlockType.OTHER);
|
||||||
return;
|
return;
|
||||||
@ -60,58 +69,64 @@ public class RedactManagerClassificationService {
|
|||||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER)
|
||||||
if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.HEADER) || PositionUtils.isOverBodyTextFrame(bodyTextFrame,
|
|| PositionUtils.isOverBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||||
textBlock,
|
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||||
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
|
||||||
.getMostPopular())) {
|
.getMostPopular())) {
|
||||||
textBlock.setClassification(PageBlockType.HEADER);
|
textBlock.setClassification(PageBlockType.HEADER);
|
||||||
|
|
||||||
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER) || PositionUtils.isUnderBodyTextFrame(bodyTextFrame,
|
} else if (MarkedContentUtils.intersects(textBlock, page.getMarkedContentBboxPerType(), MarkedContentUtils.FOOTER)
|
||||||
textBlock,
|
|| PositionUtils.isUnderBodyTextFrame(bodyTextFrame, textBlock, page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null
|
||||||
page.getRotation()) && (document.getFontSizeCounter().getMostPopular() == null || textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
|| textBlock.getHighestFontSize() <= document.getFontSizeCounter()
|
||||||
.getMostPopular())) {
|
.getMostPopular())) {
|
||||||
textBlock.setClassification(PageBlockType.FOOTER);
|
textBlock.setClassification(PageBlockType.FOOTER);
|
||||||
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock,
|
} else if (page.getPageNumber() == 1 && (PositionUtils.getHeightDifferenceBetweenChunkWordAndDocumentWord(textBlock, document.getTextHeightCounter().getMostPopular()) > 2.5
|
||||||
document.getTextHeightCounter().getMostPopular()) > 2.5 && textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks()
|
&& textBlock.getHighestFontSize() > document.getFontSizeCounter().getMostPopular() || page.getTextBlocks().size() == 1)) {
|
||||||
.size() == 1)) {
|
|
||||||
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
if (!Pattern.matches("[0-9]+", textBlock.toString())) {
|
||||||
textBlock.setClassification(PageBlockType.TITLE);
|
textBlock.setClassification(PageBlockType.TITLE);
|
||||||
}
|
}
|
||||||
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter()
|
} else if (textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular()
|
||||||
.getMostPopular() && PositionUtils.getApproxLineCount(textBlock) < 4.9 && (textBlock.getMostPopularWordStyle().equals("bold") || !document.getFontStyleCounter()
|
&& PositionUtils.getApproxLineCount(textBlock) < 4.9
|
||||||
.getCountPerValue()
|
&& (textBlock.getMostPopularWordStyle().equals("bold")
|
||||||
.containsKey("bold") && textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1) && textBlock.getSequences()
|
|| !document.getFontStyleCounter().getCountPerValue().containsKey("bold")
|
||||||
.get(0)
|
&& textBlock.getMostPopularWordFontSize() > document.getFontSizeCounter().getMostPopular() + 1)
|
||||||
.getTextPositions()
|
&& textBlock.getSequences()
|
||||||
.get(0)
|
.get(0).getTextPositions()
|
||||||
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||||
|
|
||||||
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
for (int i = 1; i <= headlineFontSizes.size(); i++) {
|
||||||
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
if (textBlock.getMostPopularWordFontSize() == headlineFontSizes.get(i - 1)) {
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(i));
|
PageBlockType headlineType = PageBlockType.getHeadlineType(i);
|
||||||
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if (!textBlock.getText().startsWith("Figure ") && PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordStyle()
|
} else if (!textBlock.getText().startsWith("Figure ")
|
||||||
.equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold") && PositionUtils.getApproxLineCount(textBlock) < 2.9 && textBlock.getSequences()
|
&& PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
.get(0)
|
&& textBlock.getMostPopularWordStyle().equals("bold")
|
||||||
.getTextPositions()
|
&& !document.getFontStyleCounter().getMostPopular().equals("bold")
|
||||||
.get(0)
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9
|
||||||
.getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
&& textBlock.getSequences()
|
||||||
textBlock.setClassification(PageBlockType.getHeadlineType(headlineFontSizes.size() + 1));
|
.get(0).getTextPositions()
|
||||||
|
.get(0).getFontSizeInPt() >= textBlock.getMostPopularWordFontSize()) {
|
||||||
|
PageBlockType headlineType = PageBlockType.getHeadlineType(headlineFontSizes.size() + 1);
|
||||||
|
headlineClassificationService.classifyHeadline(textBlock, headlineType);
|
||||||
document.setHeadlines(true);
|
document.setHeadlines(true);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("bold") && !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||||
|
&& textBlock.getMostPopularWordStyle().equals("bold")
|
||||||
|
&& !document.getFontStyleCounter().getMostPopular().equals("bold")) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
textBlock.setClassification(PageBlockType.PARAGRAPH_BOLD);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFont()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
.equals(document.getFontCounter().getMostPopular()) && textBlock.getMostPopularWordStyle()
|
&& textBlock.getMostPopularWordFont().equals(document.getFontCounter().getMostPopular())
|
||||||
.equals(document.getFontStyleCounter().getMostPopular()) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
&& textBlock.getMostPopularWordStyle().equals(document.getFontStyleCounter().getMostPopular())
|
||||||
|
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
textBlock.setClassification(PageBlockType.PARAGRAPH);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock) && textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter()
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)
|
||||||
.getMostPopular() && textBlock.getMostPopularWordStyle().equals("italic") && !document.getFontStyleCounter()
|
&& textBlock.getMostPopularWordFontSize() == document.getFontSizeCounter().getMostPopular()
|
||||||
.getMostPopular()
|
&& textBlock.getMostPopularWordStyle().equals("italic")
|
||||||
.equals("italic") && PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
&& !document.getFontStyleCounter().getMostPopular().equals("italic")
|
||||||
|
&& PositionUtils.getApproxLineCount(textBlock) < 2.9) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
textBlock.setClassification(PageBlockType.PARAGRAPH_ITALIC);
|
||||||
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
} else if (PositionUtils.isWithinBodyTextFrame(bodyTextFrame, textBlock)) {
|
||||||
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
textBlock.setClassification(PageBlockType.PARAGRAPH_UNKNOWN);
|
||||||
|
|||||||
@ -11,6 +11,7 @@ import java.util.LinkedList;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.NoSuchElementException;
|
import java.util.NoSuchElementException;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
@ -31,8 +32,10 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Im
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Paragraph;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SectionNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.textblock.AtomicTextBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.outline.TableOfContentItem;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.IdBuilder;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
import com.knecon.fforesight.service.layoutparser.processor.utils.TextPositionOperations;
|
||||||
@ -57,11 +60,6 @@ public class DocumentGraphFactory {
|
|||||||
|
|
||||||
document.getPages()
|
document.getPages()
|
||||||
.forEach(context::buildAndAddPageWithCounter);
|
.forEach(context::buildAndAddPageWithCounter);
|
||||||
document.getSections()
|
|
||||||
.stream()
|
|
||||||
.flatMap(section -> section.getImages()
|
|
||||||
.stream())
|
|
||||||
.forEach(image -> context.getImages().add(image));
|
|
||||||
addSections(layoutParsingType, document, context, documentGraph);
|
addSections(layoutParsingType, document, context, documentGraph);
|
||||||
addHeaderAndFooterToEachPage(document, context);
|
addHeaderAndFooterToEachPage(document, context);
|
||||||
|
|
||||||
@ -75,8 +73,17 @@ public class DocumentGraphFactory {
|
|||||||
|
|
||||||
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
|
private void addSections(LayoutParsingType layoutParsingType, ClassificationDocument classificationDocument, Context context, Document document) {
|
||||||
|
|
||||||
classificationDocument.getSections()
|
for (TableOfContentItem tocItem : classificationDocument.getTableOfContents()) {
|
||||||
.forEach(section -> SectionNodeFactory.addSection(layoutParsingType, null, section.getNonEmptyPageBlocks(), section.getImages(), context, document));
|
var parent = tocItem.getParent() == null ? null : tocItem.getParent().getSection();
|
||||||
|
Optional<SectionNode> section = SectionNodeFactory.addSection(layoutParsingType,
|
||||||
|
parent,
|
||||||
|
tocItem.getChildren().isEmpty(),
|
||||||
|
tocItem.getNonEmptySectionBlocks(),
|
||||||
|
tocItem.getImages(),
|
||||||
|
context,
|
||||||
|
document);
|
||||||
|
tocItem.setSection(section.orElse(null));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -181,10 +188,7 @@ public class DocumentGraphFactory {
|
|||||||
|
|
||||||
Page page = context.getPage(textBlocks.get(0).getPage());
|
Page page = context.getPage(textBlocks.get(0).getPage());
|
||||||
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
Footer footer = Footer.builder().documentTree(context.getDocumentTree()).build();
|
||||||
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks),
|
AtomicTextBlock textBlock = context.textBlockFactory.buildAtomicTextBlock(TextPositionOperations.mergeTextPositionSequence(textBlocks), footer, context, page);
|
||||||
footer,
|
|
||||||
context,
|
|
||||||
page);
|
|
||||||
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
List<Integer> tocId = context.getDocumentTree().createNewMainEntryAndReturnId(footer);
|
||||||
footer.setTreeId(tocId);
|
footer.setTreeId(tocId);
|
||||||
footer.setLeafTextBlock(textBlock);
|
footer.setLeafTextBlock(textBlock);
|
||||||
@ -236,7 +240,7 @@ public class DocumentGraphFactory {
|
|||||||
|
|
||||||
DocumentTree documentTree;
|
DocumentTree documentTree;
|
||||||
Map<Page, Integer> pages;
|
Map<Page, Integer> pages;
|
||||||
List<Section> sections;
|
List<SectionNode> sections;
|
||||||
List<ClassifiedImage> images;
|
List<ClassifiedImage> images;
|
||||||
TextBlockFactory textBlockFactory;
|
TextBlockFactory textBlockFactory;
|
||||||
|
|
||||||
|
|||||||
@ -9,6 +9,7 @@ import java.util.HashSet;
|
|||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
import com.knecon.fforesight.service.layoutparser.internal.api.queue.LayoutParsingType;
|
||||||
@ -17,6 +18,8 @@ import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Do
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SectionNode;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SuperSection;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
import com.knecon.fforesight.service.layoutparser.processor.model.image.ClassifiedImage;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.table.TablePageBlock;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
import com.knecon.fforesight.service.layoutparser.processor.model.text.TextPageBlock;
|
||||||
@ -27,12 +30,13 @@ import lombok.experimental.UtilityClass;
|
|||||||
@UtilityClass
|
@UtilityClass
|
||||||
public class SectionNodeFactory {
|
public class SectionNodeFactory {
|
||||||
|
|
||||||
public void addSection(LayoutParsingType layoutParsingType,
|
public Optional<SectionNode> addSection(LayoutParsingType layoutParsingType,
|
||||||
GenericSemanticNode parentNode,
|
GenericSemanticNode parentNode,
|
||||||
List<AbstractPageBlock> pageBlocks,
|
boolean isLeaf,
|
||||||
List<ClassifiedImage> images,
|
List<AbstractPageBlock> pageBlocks,
|
||||||
DocumentGraphFactory.Context context,
|
List<ClassifiedImage> images,
|
||||||
Document document) {
|
DocumentGraphFactory.Context context,
|
||||||
|
Document document) {
|
||||||
|
|
||||||
// This is for the case where we have images on a page without any text/footer/header.
|
// This is for the case where we have images on a page without any text/footer/header.
|
||||||
// The pageBlocks list is empty, but we still need to add those images to the document.
|
// The pageBlocks list is empty, but we still need to add those images to the document.
|
||||||
@ -40,16 +44,22 @@ public class SectionNodeFactory {
|
|||||||
images.stream()
|
images.stream()
|
||||||
.distinct()
|
.distinct()
|
||||||
.forEach(image -> DocumentGraphFactory.addImage(document, image, context));
|
.forEach(image -> DocumentGraphFactory.addImage(document, image, context));
|
||||||
return;
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pageBlocks.isEmpty()) {
|
if (pageBlocks.isEmpty()) {
|
||||||
return;
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
|
Map<Integer, List<AbstractPageBlock>> blocksPerPage = pageBlocks.stream()
|
||||||
.collect(groupingBy(AbstractPageBlock::getPage));
|
.collect(groupingBy(AbstractPageBlock::getPage));
|
||||||
Section section = Section.builder().documentTree(context.getDocumentTree()).build();
|
|
||||||
|
SectionNode section;
|
||||||
|
if (isLeaf) {
|
||||||
|
section = Section.builder().documentTree(context.getDocumentTree()).build();
|
||||||
|
} else {
|
||||||
|
section = SuperSection.builder().documentTree(context.getDocumentTree()).build();
|
||||||
|
}
|
||||||
|
|
||||||
context.getSections().add(section);
|
context.getSections().add(section);
|
||||||
blocksPerPage.keySet()
|
blocksPerPage.keySet()
|
||||||
@ -59,12 +69,24 @@ public class SectionNodeFactory {
|
|||||||
|
|
||||||
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document);
|
addFirstHeadlineDirectlyToSection(layoutParsingType, pageBlocks, context, section, document);
|
||||||
if (containsTablesAndTextBlocks(pageBlocks)) {
|
if (containsTablesAndTextBlocks(pageBlocks)) {
|
||||||
|
|
||||||
|
if (pageBlocks.get(0).isHeadline()) {
|
||||||
|
pageBlocks.remove(0);
|
||||||
|
}
|
||||||
|
|
||||||
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
|
splitPageBlocksIntoSubSections(pageBlocks).forEach(subSectionPageBlocks -> addSection(layoutParsingType,
|
||||||
section,
|
section,
|
||||||
|
true,
|
||||||
subSectionPageBlocks,
|
subSectionPageBlocks,
|
||||||
emptyList(),
|
emptyList(),
|
||||||
context,
|
context,
|
||||||
document));
|
document));
|
||||||
|
} else if (!isLeaf) {
|
||||||
|
|
||||||
|
if (pageBlocks.get(0).isHeadline()) {
|
||||||
|
pageBlocks.remove(0);
|
||||||
|
}
|
||||||
|
addSection(layoutParsingType, section, true, pageBlocks, emptyList(), context, document);
|
||||||
} else {
|
} else {
|
||||||
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
|
addTablesAndParagraphsAndHeadlinesToSection(layoutParsingType, pageBlocks, context, section, document);
|
||||||
}
|
}
|
||||||
@ -72,10 +94,12 @@ public class SectionNodeFactory {
|
|||||||
images.stream()
|
images.stream()
|
||||||
.distinct()
|
.distinct()
|
||||||
.forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
.forEach(image -> DocumentGraphFactory.addImage(section, image, context));
|
||||||
|
|
||||||
|
return Optional.of(section);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, Section section) {
|
private List<Integer> getTreeId(GenericSemanticNode parentNode, DocumentGraphFactory.Context context, SectionNode section) {
|
||||||
|
|
||||||
if (parentNode == null) {
|
if (parentNode == null) {
|
||||||
return context.getDocumentTree().createNewMainEntryAndReturnId(section);
|
return context.getDocumentTree().createNewMainEntryAndReturnId(section);
|
||||||
@ -88,7 +112,7 @@ public class SectionNodeFactory {
|
|||||||
private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType,
|
private void addFirstHeadlineDirectlyToSection(LayoutParsingType layoutParsingType,
|
||||||
List<AbstractPageBlock> pageBlocks,
|
List<AbstractPageBlock> pageBlocks,
|
||||||
DocumentGraphFactory.Context context,
|
DocumentGraphFactory.Context context,
|
||||||
Section section,
|
SectionNode section,
|
||||||
Document document) {
|
Document document) {
|
||||||
|
|
||||||
if (pageBlocks.get(0).isHeadline()) {
|
if (pageBlocks.get(0).isHeadline()) {
|
||||||
@ -101,7 +125,7 @@ public class SectionNodeFactory {
|
|||||||
private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
|
private void addTablesAndParagraphsAndHeadlinesToSection(LayoutParsingType layoutParsingType,
|
||||||
List<AbstractPageBlock> pageBlocks,
|
List<AbstractPageBlock> pageBlocks,
|
||||||
DocumentGraphFactory.Context context,
|
DocumentGraphFactory.Context context,
|
||||||
Section section,
|
SectionNode section,
|
||||||
Document document) {
|
Document document) {
|
||||||
|
|
||||||
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
Set<AbstractPageBlock> alreadyMerged = new HashSet<>();
|
||||||
@ -226,7 +250,7 @@ public class SectionNodeFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, Section section, Integer pageNumber) {
|
private void addSectionNodeToPageNode(DocumentGraphFactory.Context context, SectionNode section, Integer pageNumber) {
|
||||||
|
|
||||||
Page page = context.getPage(pageNumber);
|
Page page = context.getPage(pageNumber);
|
||||||
page.getMainBody().add(section);
|
page.getMainBody().add(section);
|
||||||
|
|||||||
@ -12,6 +12,7 @@ import com.knecon.fforesight.service.layoutparser.processor.model.AbstractPageBl
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Document;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.GenericSemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Page;
|
||||||
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Section;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.SemanticNode;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.Table;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
import com.knecon.fforesight.service.layoutparser.processor.model.graph.nodes.TableCell;
|
||||||
@ -154,10 +155,11 @@ public class TableNodeFactory {
|
|||||||
} else if (firstTextBlockIsHeadline(cell)) {
|
} else if (firstTextBlockIsHeadline(cell)) {
|
||||||
SectionNodeFactory.addSection(layoutParsingType,
|
SectionNodeFactory.addSection(layoutParsingType,
|
||||||
tableCell,
|
tableCell,
|
||||||
|
true,
|
||||||
cell.getTextBlocks()
|
cell.getTextBlocks()
|
||||||
.stream()
|
.stream()
|
||||||
.map(tb -> (AbstractPageBlock) tb)
|
.map(tb -> (AbstractPageBlock) tb)
|
||||||
.toList(),
|
.collect(Collectors.toList()),
|
||||||
emptyList(),
|
emptyList(),
|
||||||
context,
|
context,
|
||||||
document);
|
document);
|
||||||
|
|||||||
@ -61,7 +61,7 @@ public class DocumentGraphMapper {
|
|||||||
List<Page> pages = Arrays.stream(entryData.getPageNumbers()).map(pageNumber -> getPage(pageNumber, context)).toList();
|
List<Page> pages = Arrays.stream(entryData.getPageNumbers()).map(pageNumber -> getPage(pageNumber, context)).toList();
|
||||||
|
|
||||||
SemanticNode node = switch (entryData.getType()) {
|
SemanticNode node = switch (entryData.getType()) {
|
||||||
case SECTION -> buildSection(context);
|
case SECTION, SUPER_SECTION -> buildSection(context);
|
||||||
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
|
case PARAGRAPH -> buildParagraph(context, entryData.getProperties());
|
||||||
case HEADLINE -> buildHeadline(context);
|
case HEADLINE -> buildHeadline(context);
|
||||||
case HEADER -> buildHeader(context);
|
case HEADER -> buildHeader(context);
|
||||||
|
|||||||
@ -193,10 +193,11 @@ public class LayoutGridService {
|
|||||||
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION)
|
List<SemanticNode> subSections = semanticNode.streamAllSubNodesOfType(NodeType.SECTION)
|
||||||
.toList();
|
.toList();
|
||||||
Page firstPage = semanticNode.getFirstPage();
|
Page firstPage = semanticNode.getFirstPage();
|
||||||
|
String treeIdString = buildTreeIdString(semanticNode);
|
||||||
if (!subSections.isEmpty()) {
|
if (!subSections.isEmpty()) {
|
||||||
addPlacedText(firstPage, bBoxMap.get(firstPage), buildTreeIdString(semanticNode), layoutGrid);
|
addPlacedText(firstPage, bBoxMap.get(firstPage), treeIdString, layoutGrid);
|
||||||
} else {
|
} else {
|
||||||
bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, buildTreeIdString(semanticNode), layoutGrid)));
|
bBoxMap.forEach(((page, textBBox) -> addPlacedText(page, textBBox, treeIdString, layoutGrid)));
|
||||||
}
|
}
|
||||||
if (bBoxMap.values().size() == 1) {
|
if (bBoxMap.values().size() == 1) {
|
||||||
Rectangle2D r = RectangleTransformations.pad(bBoxMap.get(firstPage), LINE_WIDTH, LINE_WIDTH);
|
Rectangle2D r = RectangleTransformations.pad(bBoxMap.get(firstPage), LINE_WIDTH, LINE_WIDTH);
|
||||||
|
|||||||
@ -112,8 +112,8 @@ public class PdfVisualisationUtility {
|
|||||||
case DOCUMENT -> Color.LIGHT_GRAY;
|
case DOCUMENT -> Color.LIGHT_GRAY;
|
||||||
case HEADER, FOOTER -> Color.GREEN;
|
case HEADER, FOOTER -> Color.GREEN;
|
||||||
case PARAGRAPH -> Color.BLUE;
|
case PARAGRAPH -> Color.BLUE;
|
||||||
|
case SUPER_SECTION, SECTION -> Color.BLACK;
|
||||||
case HEADLINE -> Color.RED;
|
case HEADLINE -> Color.RED;
|
||||||
case SECTION -> Color.BLACK;
|
|
||||||
case TABLE -> Color.ORANGE;
|
case TABLE -> Color.ORANGE;
|
||||||
case TABLE_CELL -> Color.GRAY;
|
case TABLE_CELL -> Color.GRAY;
|
||||||
case IMAGE -> Color.MAGENTA;
|
case IMAGE -> Color.MAGENTA;
|
||||||
|
|||||||
@ -32,6 +32,7 @@ public class ViewerDocumentTest extends BuildDocumentTest {
|
|||||||
public void testViewerDocument() {
|
public void testViewerDocument() {
|
||||||
|
|
||||||
String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
String fileName = "files/syngenta/CustomerFiles/S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf";
|
||||||
|
|
||||||
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
String tmpFileName = "/tmp/" + Path.of(fileName).getFileName() + "_VIEWER.pdf";
|
||||||
|
|
||||||
var documentFile = new ClassPathResource(fileName).getFile();
|
var documentFile = new ClassPathResource(fileName).getFile();
|
||||||
|
|||||||
@ -37,8 +37,6 @@ import com.knecon.fforesight.service.layoutparser.processor.python_api.model.ima
|
|||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.TableServiceResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
import com.knecon.fforesight.service.layoutparser.processor.python_api.model.table.VisualLayoutParsingResponse;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
import com.knecon.fforesight.service.layoutparser.processor.services.PageContentExtractor;
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.SectionsBuilderService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.classification.RedactManagerClassificationService;
|
|
||||||
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
import com.knecon.fforesight.service.layoutparser.processor.services.factory.DocumentGraphFactory;
|
||||||
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
import com.knecon.fforesight.service.layoutparser.server.utils.AbstractTest;
|
||||||
|
|
||||||
@ -62,6 +60,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
tableServiceResponse,
|
tableServiceResponse,
|
||||||
new VisualLayoutParsingResponse(),
|
new VisualLayoutParsingResponse(),
|
||||||
Map.of("file", "document"));
|
Map.of("file", "document"));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -134,6 +133,7 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
@Test
|
@Test
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void testTableAndCellRotations() {
|
public void testTableAndCellRotations() {
|
||||||
|
|
||||||
String fileName = "files/Minimal Examples/simpleTablesRotated.pdf";
|
String fileName = "files/Minimal Examples/simpleTablesRotated.pdf";
|
||||||
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
ClassPathResource pdfFileResource = new ClassPathResource(fileName);
|
||||||
|
|
||||||
@ -141,7 +141,6 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@Disabled
|
@Disabled
|
||||||
@Test
|
@Test
|
||||||
public void testScanRotationBorderIsIgnored() throws IOException {
|
public void testScanRotationBorderIsIgnored() throws IOException {
|
||||||
@ -151,15 +150,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
var tableServiceResponse = objectMapper.readValue(cvTablesResource.getInputStream(), TableServiceResponse.class);
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile(), tableServiceResponse);
|
||||||
assertThat(document.getSections()
|
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
.collect(Collectors.toList())).isNotEmpty();
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
var tables = document.getSections()
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
|
.toList()).isNotEmpty();
|
||||||
|
var tables = document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
// Quality of the table parsing is not good, because the file is rotated at scanning.
|
// Quality of the table parsing is not good, because the file is rotated at scanning.
|
||||||
@ -199,15 +202,19 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
ClassPathResource pdfFileResource = new ClassPathResource("files/syngenta/CustomerFiles/SinglePages/Spanning Cells - Page131_S-Metolachlor_RAR_01_Volume_1_2018-09-06.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getSections()
|
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
.collect(Collectors.toList())).isNotEmpty();
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
TablePageBlock table = document.getSections()
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
|
.toList()).isNotEmpty();
|
||||||
|
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.toList()
|
.toList()
|
||||||
.get(0);
|
.get(0);
|
||||||
assertThat(table.getColCount()).isEqualTo(6);
|
assertThat(table.getColCount()).isEqualTo(6);
|
||||||
@ -225,23 +232,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
"files/syngenta/CustomerFiles/SinglePages/Merge Table - Page5_26 A8637C - EU AIR3 - LCP Section 10 - Ecotoxicological studies on the plant protection product - Reference list.pdf");
|
"files/syngenta/CustomerFiles/SinglePages/Merge Table - Page5_26 A8637C - EU AIR3 - LCP Section 10 - Ecotoxicological studies on the plant protection product - Reference list.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getSections()
|
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
.collect(Collectors.toList())).isNotEmpty();
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
TablePageBlock firstTable = document.getSections()
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
|
.toList()).isNotEmpty();
|
||||||
|
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.toList()
|
.toList()
|
||||||
.get(0);
|
.get(0);
|
||||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||||
TablePageBlock secondTable = document.getSections()
|
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.toList()
|
.toList()
|
||||||
.get(1);
|
.get(1);
|
||||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||||
@ -266,23 +279,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
"files/syngenta/CustomerFiles/SinglePages/Merge Multi Page Table - Page4_Page5_51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
|
"files/syngenta/CustomerFiles/SinglePages/Merge Multi Page Table - Page4_Page5_51 Fludioxonil_RAR_02_Volume_2_2018-02-21.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getSections()
|
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
.collect(Collectors.toList())).isNotEmpty();
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
TablePageBlock firstTable = document.getSections()
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
|
.toList()).isNotEmpty();
|
||||||
|
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.toList()
|
.toList()
|
||||||
.get(0);
|
.get(0);
|
||||||
assertThat(firstTable.getColCount()).isEqualTo(9);
|
assertThat(firstTable.getColCount()).isEqualTo(9);
|
||||||
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
assertThat(firstTable.getRowCount()).isEqualTo(5);
|
||||||
TablePageBlock secondTable = document.getSections()
|
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.toList()
|
.toList()
|
||||||
.get(1);
|
.get(1);
|
||||||
assertThat(secondTable.getColCount()).isEqualTo(9);
|
assertThat(secondTable.getColCount()).isEqualTo(9);
|
||||||
@ -307,23 +326,29 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
"files/syngenta/CustomerFiles/SinglePages/Rotated Table Headers - Page4_65 Mesotrione - EU AIR3 - LCA Section 1 Supplement Reference List.pdf");
|
"files/syngenta/CustomerFiles/SinglePages/Rotated Table Headers - Page4_65 Mesotrione - EU AIR3 - LCA Section 1 Supplement Reference List.pdf");
|
||||||
|
|
||||||
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
ClassificationDocument document = buildClassificationDocument(pdfFileResource.getFile());
|
||||||
assertThat(document.getSections()
|
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
.collect(Collectors.toList())).isNotEmpty();
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
TablePageBlock firstTable = document.getSections()
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
|
.toList()).isNotEmpty();
|
||||||
|
TablePageBlock firstTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.toList()
|
.toList()
|
||||||
.get(0);
|
.get(0);
|
||||||
assertThat(firstTable.getColCount()).isEqualTo(8);
|
assertThat(firstTable.getColCount()).isEqualTo(8);
|
||||||
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
assertThat(firstTable.getRowCount()).isEqualTo(1);
|
||||||
TablePageBlock secondTable = document.getSections()
|
TablePageBlock secondTable = document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.toList()
|
.toList()
|
||||||
.get(1);
|
.get(1);
|
||||||
assertThat(secondTable.getColCount()).isEqualTo(8);
|
assertThat(secondTable.getColCount()).isEqualTo(8);
|
||||||
@ -818,10 +843,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void toHtml(ClassificationDocument document, String filename) {
|
private void toHtml(ClassificationDocument document, String filename) {
|
||||||
|
|
||||||
var tables = document.getSections()
|
var tables = document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.toList();
|
.toList();
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
|
|
||||||
@ -843,12 +870,15 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
private void validateTable(ClassificationDocument document, int tableIndex, int colCount, int rowCount, int emptyCellsCountCorrect, int emptyCellsCountIncorrect) {
|
||||||
|
|
||||||
TablePageBlock table = document.getSections()
|
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.toList()
|
.toList()
|
||||||
.get(tableIndex);
|
.get(tableIndex);
|
||||||
|
|
||||||
List<List<Cell>> rows = table.getRows();
|
List<List<Cell>> rows = table.getRows();
|
||||||
int emptyCellsFoundFound = rows.stream()
|
int emptyCellsFoundFound = rows.stream()
|
||||||
.flatMap(List::stream)
|
.flatMap(List::stream)
|
||||||
@ -870,10 +900,12 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
|
private void validateTable(ClassificationDocument document, int tableIndex, List<List<String>> values) {
|
||||||
|
|
||||||
TablePageBlock table = document.getSections()
|
TablePageBlock table = document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
|
.map(abstractPageBlock -> (TablePageBlock) abstractPageBlock)
|
||||||
.toList()
|
.toList()
|
||||||
.get(tableIndex);
|
.get(tableIndex);
|
||||||
List<List<Cell>> rows = table.getRows();
|
List<List<Cell>> rows = table.getRows();
|
||||||
@ -896,10 +928,11 @@ public class PdfSegmentationServiceTest extends AbstractTest {
|
|||||||
|
|
||||||
private void validateTableSize(ClassificationDocument document, int tableSize) {
|
private void validateTableSize(ClassificationDocument document, int tableSize) {
|
||||||
|
|
||||||
assertThat(document.getSections()
|
assertThat(document.getTableOfContents().getAllTableOfContentItems()
|
||||||
.stream()
|
.stream()
|
||||||
.flatMap(paragraph -> paragraph.getTables()
|
.flatMap(tocItem -> tocItem.getSectionBlocks()
|
||||||
.stream())
|
.stream()
|
||||||
|
.filter(abstractPageBlock -> abstractPageBlock instanceof TablePageBlock))
|
||||||
.toList().size()).isEqualTo(tableSize);
|
.toList().size()).isEqualTo(tableSize);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@ -93,6 +93,7 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
.toList();
|
.toList();
|
||||||
|
|
||||||
for (String pdfFileName : pdfFileNames) {
|
for (String pdfFileName : pdfFileNames) {
|
||||||
|
|
||||||
writeJsons(Path.of(pdfFileName));
|
writeJsons(Path.of(pdfFileName));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -102,15 +103,15 @@ public class RulingCleaningServiceTest extends BuildDocumentTest {
|
|||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private void writeJsons(Path filename) {
|
private void writeJsons(Path filename) {
|
||||||
|
|
||||||
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
Document documentGraphBefore = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
||||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
filename.toFile(),
|
filename.toFile(),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse(),
|
new TableServiceResponse(),
|
||||||
new VisualLayoutParsingResponse(),
|
new VisualLayoutParsingResponse(),
|
||||||
Map.of("file",filename.toFile().toString())));
|
Map.of("file",filename.toFile().toString())));
|
||||||
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER_OLD,
|
Document documentGraphAfter = DocumentGraphFactory.buildDocumentGraph(LayoutParsingType.REDACT_MANAGER,
|
||||||
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER_OLD,
|
layoutParsingPipeline.parseLayout(LayoutParsingType.REDACT_MANAGER,
|
||||||
filename.toFile(),
|
filename.toFile(),
|
||||||
new ImageServiceResponse(),
|
new ImageServiceResponse(),
|
||||||
new TableServiceResponse(),
|
new TableServiceResponse(),
|
||||||
|
|||||||
@ -229,7 +229,7 @@ public class PdfDraw {
|
|||||||
case HEADER, FOOTER -> Color.GREEN;
|
case HEADER, FOOTER -> Color.GREEN;
|
||||||
case PARAGRAPH -> Color.BLUE;
|
case PARAGRAPH -> Color.BLUE;
|
||||||
case HEADLINE -> Color.RED;
|
case HEADLINE -> Color.RED;
|
||||||
case SECTION -> Color.BLACK;
|
case SECTION, SUPER_SECTION -> Color.BLACK;
|
||||||
case TABLE -> Color.ORANGE;
|
case TABLE -> Color.ORANGE;
|
||||||
case TABLE_CELL -> Color.GRAY;
|
case TABLE_CELL -> Color.GRAY;
|
||||||
case IMAGE -> Color.MAGENTA;
|
case IMAGE -> Color.MAGENTA;
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
x
Reference in New Issue
Block a user